Line data Source code
1 : /*
2 : * Copyright (c) 2016-2019 Cisco and/or its affiliates.
3 : * Licensed under the Apache License, Version 2.0 (the "License");
4 : * you may not use this file except in compliance with the License.
5 : * You may obtain a copy of the License at:
6 : *
7 : * http://www.apache.org/licenses/LICENSE-2.0
8 : *
9 : * Unless required by applicable law or agreed to in writing, software
10 : * distributed under the License is distributed on an "AS IS" BASIS,
11 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 : * See the License for the specific language governing permissions and
13 : * limitations under the License.
14 : */
15 :
16 : #include <vnet/tcp/tcp.h>
17 : #include <vnet/tcp/tcp_inlines.h>
18 : #include <math.h>
19 : #include <vnet/ip/ip4_inlines.h>
20 : #include <vnet/ip/ip6_inlines.h>
21 :
/** Next-node indices for the tcp4/tcp6 output nodes. See the
 *  foreach_tcp[46]_output_next macros below for the graph node
 *  each index maps to. */
typedef enum _tcp_output_next
{
  TCP_OUTPUT_NEXT_DROP,
  TCP_OUTPUT_NEXT_IP_LOOKUP,
  TCP_OUTPUT_NEXT_IP_REWRITE,
  TCP_OUTPUT_NEXT_IP_ARP,
  TCP_OUTPUT_N_NEXT
} tcp_output_next_t;
30 :
/* tcp4-output next index -> graph node name mapping */
#define foreach_tcp4_output_next \
  _ (DROP, "error-drop") \
  _ (IP_LOOKUP, "ip4-lookup") \
  _ (IP_REWRITE, "ip4-rewrite") \
  _ (IP_ARP, "ip4-arp")

/* tcp6-output next index -> graph node name mapping; neighbor discovery
 * plays the role ARP does for ip4 */
#define foreach_tcp6_output_next \
  _ (DROP, "error-drop") \
  _ (IP_LOOKUP, "ip6-lookup") \
  _ (IP_REWRITE, "ip6-rewrite") \
  _ (IP_ARP, "ip6-discover-neighbor")
42 :
/* Error counter descriptors expanded from tcp_error.def:
 * { name, description, severity } per tcp error code */
static vlib_error_desc_t tcp_output_error_counters[] = {
#define tcp_error(f, n, s, d) { #n, d, VL_COUNTER_SEVERITY_##s },
#include <vnet/tcp/tcp_error.def>
#undef tcp_error
};
48 :
/** Per-packet tx trace record: copy of the tcp header sent plus a
 *  snapshot of the connection at send time */
typedef struct
{
  tcp_header_t tcp_header;
  tcp_connection_t tcp_connection;
} tcp_tx_trace_t;
54 :
/** Format a tcp_tx_trace_t: connection id, state and the tcp header */
static u8 *
format_tcp_tx_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *);
  tcp_connection_t *tc = &t->tcp_connection;
  u32 indent = format_get_indent (s);

  s = format (s, "%U state %U\n%U%U", format_tcp_connection_id, tc,
	      format_tcp_state, tc->state, format_white_space, indent,
	      format_tcp_header, &t->tcp_header, 128);

  return s;
}
70 :
71 : #ifndef CLIB_MARCH_VARIANT
72 : static u8
73 264 : tcp_window_compute_scale (u32 window)
74 : {
75 264 : u8 wnd_scale = 0;
76 2904 : while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX)
77 2640 : wnd_scale++;
78 264 : return wnd_scale;
79 : }
80 :
81 : /**
82 : * TCP's initial window
83 : */
/** Unscaled initial receive window for a new connection.
 *
 *  Returns the configured minimum rx fifo size rather than the RFC 6928
 *  style mss-based value; see the comment below for why. */
always_inline u32
tcp_initial_wnd_unscaled (tcp_connection_t * tc)
{
  /* RFC 6928 recommends the value lower. However at the time our connections
   * are initialized, fifos may not be allocated. Therefore, advertise the
   * smallest possible unscaled window size and update once fifos are
   * assigned to the session.
   */
  /*
     tcp_update_rcv_mss (tc);
     TCP_IW_N_SEGMENTS * tc->mss;
   */
  return tcp_cfg.min_rx_fifo;
}
98 :
99 : /**
100 : * Compute initial window and scale factor. As per RFC1323, window field in
101 : * SYN and SYN-ACK segments is never scaled.
102 : */
/** Compute initial window and scale factor. As per RFC 1323/7323, the
 *  window field in SYN and SYN-ACK segments is never scaled, so the
 *  return value is clamped to the unscaled 16-bit maximum. Side effect:
 *  updates tc->rcv_wscale and tc->rcv_wnd. */
u32
tcp_initial_window_to_advertise (tcp_connection_t * tc)
{
  /* Compute rcv wscale only if peer advertised support for it */
  if (tc->state != TCP_STATE_SYN_RCVD || tcp_opts_wscale (&tc->rcv_opts))
    tc->rcv_wscale = tcp_window_compute_scale (tcp_cfg.max_rx_fifo);

  tc->rcv_wnd = tcp_initial_wnd_unscaled (tc);

  return clib_min (tc->rcv_wnd, TCP_WND_MAX);
}
114 :
/** Recompute tc->rcv_wnd from current rx fifo occupancy.
 *
 *  The result is rounded down to a multiple of 1 << rcv_wscale and
 *  clamped to the maximum representable scaled window. */
static inline void
tcp_update_rcv_wnd (tcp_connection_t * tc)
{
  u32 available_space, wnd;
  i32 observed_wnd;

  /*
   * Figure out how much space we have available
   */
  available_space = transport_max_rx_enqueue (&tc->connection);

  /*
   * Use the above and what we know about what we've previously advertised
   * to compute the new window
   */
  observed_wnd = (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);

  /* Check if we are about to retract the window. Do the comparison before
   * rounding to avoid errors. Per RFC7323 sec. 2.4 we could remove this */
  if (PREDICT_FALSE ((i32) available_space < observed_wnd))
    {
      /* Never advertise less than what the peer already saw */
      wnd = round_down_pow2 (clib_max (observed_wnd, 0), 1 << tc->rcv_wscale);
      TCP_EVT (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space);
    }
  else
    {
      /* Make sure we have a multiple of 1 << rcv_wscale. We round down to
       * avoid advertising a window larger than what can be buffered */
      wnd = round_down_pow2 (available_space, 1 << tc->rcv_wscale);
    }

  /* Advertise nothing rather than a window smaller than the peer's mss */
  if (PREDICT_FALSE (wnd < tc->rcv_opts.mss))
    wnd = 0;

  tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale);
}
151 :
152 : /**
153 : * Compute and return window to advertise, scaled as per RFC1323
154 : */
/**
 * Compute and return window to advertise, scaled as per RFC1323.
 * Pre-establishment states fall back to the unscaled initial window.
 */
static inline u32
tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state)
{
  if (state < TCP_STATE_ESTABLISHED)
    return tcp_initial_window_to_advertise (tc);

  tcp_update_rcv_wnd (tc);
  return tc->rcv_wnd >> tc->rcv_wscale;
}
164 :
/** Populate options for a SYN segment: MSS, window scale, timestamp and,
 *  if enabled, SACK-permitted. Returns the padded options length. */
static int
tcp_make_syn_options (tcp_connection_t * tc, tcp_options_t * opts)
{
  u8 len = 0;

  opts->flags |= TCP_OPTS_FLAG_MSS;
  opts->mss = tc->mss;
  len += TCP_OPTION_LEN_MSS;

  opts->flags |= TCP_OPTS_FLAG_WSCALE;
  opts->wscale = tc->rcv_wscale;
  len += TCP_OPTION_LEN_WINDOW_SCALE;

  opts->flags |= TCP_OPTS_FLAG_TSTAMP;
  opts->tsval = tcp_time_tstamp (tc->c_thread_index);
  /* No peer timestamp seen yet on an initial SYN */
  opts->tsecr = 0;
  len += TCP_OPTION_LEN_TIMESTAMP;

  if (TCP_USE_SACKS)
    {
      opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
      len += TCP_OPTION_LEN_SACK_PERMITTED;
    }

  /* Align to needed boundary */
  len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
  return len;
}
193 :
/** Populate options for a SYN-ACK. MSS is always sent; window scale,
 *  timestamp and SACK-permitted are echoed only if the peer advertised
 *  them in its SYN. Returns the padded options length. */
static int
tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts)
{
  u8 len = 0;

  opts->flags |= TCP_OPTS_FLAG_MSS;
  opts->mss = tc->mss;
  len += TCP_OPTION_LEN_MSS;

  if (tcp_opts_wscale (&tc->rcv_opts))
    {
      opts->flags |= TCP_OPTS_FLAG_WSCALE;
      opts->wscale = tc->rcv_wscale;
      len += TCP_OPTION_LEN_WINDOW_SCALE;
    }

  if (tcp_opts_tstamp (&tc->rcv_opts))
    {
      opts->flags |= TCP_OPTS_FLAG_TSTAMP;
      opts->tsval = tcp_time_tstamp (tc->c_thread_index);
      opts->tsecr = tc->tsval_recent;
      len += TCP_OPTION_LEN_TIMESTAMP;
    }

  if (tcp_opts_sack_permitted (&tc->rcv_opts))
    {
      opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
      len += TCP_OPTION_LEN_SACK_PERMITTED;
    }

  /* Align to needed boundary */
  len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
  return len;
}
228 :
/** Populate options for an established-state segment: timestamp and
 *  outstanding SACK blocks, both only if negotiated. SACK blocks are
 *  rotated via snd_sack_pos so large sack lists are spread over
 *  multiple segments. Returns the padded options length. */
static int
tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts)
{
  u8 len = 0;

  opts->flags = 0;

  if (tcp_opts_tstamp (&tc->rcv_opts))
    {
      opts->flags |= TCP_OPTS_FLAG_TSTAMP;
      opts->tsval = tcp_tstamp (tc);
      opts->tsecr = tc->tsval_recent;
      len += TCP_OPTION_LEN_TIMESTAMP;
    }
  if (tcp_opts_sack_permitted (&tc->rcv_opts))
    {
      if (vec_len (tc->snd_sacks))
	{
	  opts->flags |= TCP_OPTS_FLAG_SACK;
	  /* Wrap the rotation cursor if it ran off the end */
	  if (tc->snd_sack_pos >= vec_len (tc->snd_sacks))
	    tc->snd_sack_pos = 0;
	  opts->sacks = &tc->snd_sacks[tc->snd_sack_pos];
	  opts->n_sack_blocks = vec_len (tc->snd_sacks) - tc->snd_sack_pos;
	  opts->n_sack_blocks = clib_min (opts->n_sack_blocks,
					  TCP_OPTS_MAX_SACK_BLOCKS);
	  tc->snd_sack_pos += opts->n_sack_blocks;
	  /* 2 bytes for kind + length, then 8 bytes per block */
	  len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks;
	}
    }

  /* Align to needed boundary */
  len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
  return len;
}
263 :
/** Dispatch option building on connection state. Returns the padded
 *  options length, or 0 (with a warning) for unhandled states. */
always_inline int
tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts,
		  tcp_state_t state)
{
  switch (state)
    {
    case TCP_STATE_ESTABLISHED:
    case TCP_STATE_CLOSE_WAIT:
    case TCP_STATE_FIN_WAIT_1:
    case TCP_STATE_LAST_ACK:
    case TCP_STATE_CLOSING:
    case TCP_STATE_FIN_WAIT_2:
    case TCP_STATE_TIME_WAIT:
    case TCP_STATE_CLOSED:
      return tcp_make_established_options (tc, opts);
    case TCP_STATE_SYN_RCVD:
      return tcp_make_synack_options (tc, opts);
    case TCP_STATE_SYN_SENT:
      return tcp_make_syn_options (tc, opts);
    default:
      clib_warning ("State not handled! %d", state);
      return 0;
    }
}
288 :
289 : /**
290 : * Update burst send vars
291 : *
292 : * - Updates snd_mss to reflect the effective segment size that we can send
293 : * by taking into account all TCP options, including SACKs.
294 : * - Cache 'on the wire' options for reuse
295 : * - Updates receive window which can be reused for a burst.
296 : *
297 : * This should *only* be called when doing bursts
298 : */
void
tcp_update_burst_snd_vars (tcp_connection_t * tc)
{
  tcp_main_t *tm = &tcp_main;

  /* Compute options to be used for connection. These may be reused when
   * sending data or to compute the effective mss (snd_mss) */
  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts,
				       TCP_STATE_ESTABLISHED);

  /* XXX check if MTU has been updated */
  tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len;
  ASSERT (tc->snd_mss > 0);

  /* Cache wire-format options per worker so the burst path can memcpy
   * them instead of re-serializing for every segment */
  tcp_options_write (tm->wrk_ctx[tc->c_thread_index].cached_opts,
		     &tc->snd_opts);

  tcp_update_rcv_wnd (tc);

  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
    tcp_bt_check_app_limited (tc);

  /* Nothing in flight: signal congestion control that tx is (re)starting */
  if (tc->snd_una == tc->snd_nxt)
    {
      tcp_cc_event (tc, TCP_CC_EVT_START_TX);
    }

  if (tc->flags & TCP_CONN_PSH_PENDING)
    {
      u32 max_deq = transport_max_tx_dequeue (&tc->connection);
      /* Last byte marked for push */
      tc->psh_seq = tc->snd_una + max_deq - 1;
    }
}
333 :
/** Reset a freshly allocated buffer for tcp use and reserve headroom for
 *  the tcp/ip headers. Returns a pointer to where payload may be written. */
static void *
tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
  ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
  b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
  b->total_length_not_including_first_buffer = 0;
  b->current_data = 0;
  vnet_buffer (b)->tcp.flags = 0;
  /* Leave enough space for headers */
  return vlib_buffer_make_headroom (b, TRANSPORT_MAX_HDRS_LEN);
}
345 :
346 : /* Compute TCP checksum in software when offloading is disabled for a connection */
/* Compute TCP checksum in software when offloading is disabled for a connection */
u16
ip6_tcp_compute_checksum_custom (vlib_main_t * vm, vlib_buffer_t * p0,
				 ip46_address_t * src, ip46_address_t * dst)
{
  ip_csum_t sum0;
  u16 payload_length_host_byte_order;
  u32 i;

  /* Initialize checksum with ip header. */
  sum0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0)) +
    clib_host_to_net_u16 (IP_PROTOCOL_TCP);
  payload_length_host_byte_order = vlib_buffer_length_in_chain (vm, p0);

  /* Fold the ip6 pseudo-header addresses word by word */
  for (i = 0; i < ARRAY_LEN (src->ip6.as_uword); i++)
    {
      sum0 = ip_csum_with_carry
	(sum0, clib_mem_unaligned (&src->ip6.as_uword[i], uword));
      sum0 = ip_csum_with_carry
	(sum0, clib_mem_unaligned (&dst->ip6.as_uword[i], uword));
    }

  return ip_calculate_l4_checksum (vm, p0, sum0,
				   payload_length_host_byte_order, NULL, 0,
				   NULL);
}
372 :
/** ip4 variant of the software TCP checksum: pseudo-header from the
 *  src/dst addresses, protocol and payload length, then the l4 fold. */
u16
ip4_tcp_compute_checksum_custom (vlib_main_t * vm, vlib_buffer_t * p0,
				 ip46_address_t * src, ip46_address_t * dst)
{
  ip_csum_t sum0;
  u32 payload_length_host_byte_order;

  payload_length_host_byte_order = vlib_buffer_length_in_chain (vm, p0);
  sum0 =
    clib_host_to_net_u32 (payload_length_host_byte_order +
			  (IP_PROTOCOL_TCP << 16));

  sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&src->ip4, u32));
  sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&dst->ip4, u32));

  return ip_calculate_l4_checksum (vm, p0, sum0,
				   payload_length_host_byte_order, NULL, 0,
				   NULL);
}
392 :
/** Checksum a tx segment. If checksum offload is disabled for the
 *  connection, compute it in software; otherwise mark the buffer for
 *  hardware offload and return 0. */
static inline u16
tcp_compute_checksum (tcp_connection_t * tc, vlib_buffer_t * b)
{
  u16 checksum = 0;
  if (PREDICT_FALSE (tc->cfg_flags & TCP_CFG_F_NO_CSUM_OFFLOAD))
    {
      tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
      vlib_main_t *vm = wrk->vm;

      if (tc->c_is_ip4)
	checksum = ip4_tcp_compute_checksum_custom
	  (vm, b, &tc->c_lcl_ip, &tc->c_rmt_ip);
      else
	checksum = ip6_tcp_compute_checksum_custom
	  (vm, b, &tc->c_lcl_ip, &tc->c_rmt_ip);
    }
  else
    {
      vnet_buffer_offload_flags_set (b, VNET_BUFFER_OFFLOAD_F_TCP_CKSUM);
    }
  return checksum;
}
415 :
416 : /**
417 : * Prepare ACK
418 : */
/**
 * Prepare ACK: push a tcp header with options and the given flags onto
 * the buffer. If the advertised window is zero, request a dequeue
 * notification on the rx fifo so an update can be sent when space frees.
 */
static inline void
tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state,
		u8 flags)
{
  tcp_options_t _snd_opts, *snd_opts = &_snd_opts;
  u8 tcp_opts_len, tcp_hdr_opts_len;
  tcp_header_t *th;
  u16 wnd;

  wnd = tcp_window_to_advertise (tc, state);

  /* Make and write options */
  tcp_opts_len = tcp_make_established_options (tc, snd_opts);
  tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);

  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
			     tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd);

  tcp_options_write ((u8 *) (th + 1), snd_opts);

  th->checksum = tcp_compute_checksum (tc, b);

  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;

  if (wnd == 0)
    {
      transport_rx_fifo_req_deq_ntf (&tc->connection);
      tcp_zero_rwnd_sent_on (tc);
    }
  else
    tcp_zero_rwnd_sent_off (tc);
}
451 :
452 : /**
453 : * Convert buffer to ACK
454 : */
/**
 * Convert buffer to ACK. Also advances rcv_las since everything up to
 * rcv_nxt is being acknowledged by this segment.
 */
static inline void
tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b)
{
  tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK);
  TCP_EVT (TCP_EVT_ACK_SENT, tc);
  tc->rcv_las = tc->rcv_nxt;
}
462 :
463 : /**
464 : * Convert buffer to FIN-ACK
465 : */
/**
 * Convert buffer to FIN-ACK
 */
static void
tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b)
{
  tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK);
}
471 :
472 : /**
473 : * Convert buffer to SYN
474 : */
/**
 * Convert buffer to SYN. Uses tc->iss as the sequence number and the
 * unscaled initial window (per RFC 1323 the SYN window is never scaled).
 */
void
tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b)
{
  u8 tcp_hdr_opts_len, tcp_opts_len;
  tcp_header_t *th;
  u16 initial_wnd;
  tcp_options_t snd_opts;

  initial_wnd = tcp_initial_window_to_advertise (tc);

  /* Make and write options */
  clib_memset (&snd_opts, 0, sizeof (snd_opts));
  tcp_opts_len = tcp_make_syn_options (tc, &snd_opts);
  tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);

  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
			     tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
			     initial_wnd);
  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
  tcp_options_write ((u8 *) (th + 1), &snd_opts);
  th->checksum = tcp_compute_checksum (tc, b);
}
497 :
498 : /**
499 : * Convert buffer to SYN-ACK
500 : */
/**
 * Convert buffer to SYN-ACK. Like SYN, sent with tc->iss and the
 * unscaled initial window; options are those negotiated with the peer.
 */
static void
tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b)
{
  tcp_options_t _snd_opts, *snd_opts = &_snd_opts;
  u8 tcp_opts_len, tcp_hdr_opts_len;
  tcp_header_t *th;
  u16 initial_wnd;

  clib_memset (snd_opts, 0, sizeof (*snd_opts));
  initial_wnd = tcp_initial_window_to_advertise (tc);
  tcp_opts_len = tcp_make_synack_options (tc, snd_opts);
  tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);

  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
			     tc->rcv_nxt, tcp_hdr_opts_len,
			     TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd);
  tcp_options_write ((u8 *) (th + 1), snd_opts);

  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
  th->checksum = tcp_compute_checksum (tc, b);
}
522 :
/** Enqueue a half-open (SYN/RST) buffer for tx. If called on the main
 *  thread while workers exist, kick the main-thread session queue so the
 *  packet is not left waiting for the next dispatch. */
static void
tcp_enqueue_half_open (tcp_worker_ctx_t *wrk, tcp_connection_t *tc,
		       vlib_buffer_t *b, u32 bi)
{
  vlib_main_t *vm = wrk->vm;

  b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
  b->error = 0;

  /* tco_next_node[0] is the ip4 path, [1] the ip6 path */
  session_add_pending_tx_buffer (vm->thread_index, bi,
				 wrk->tco_next_node[!tc->c_is_ip4]);

  if (vm->thread_index == 0 && vlib_num_workers ())
    session_queue_run_on_main_thread (vm);
}
538 :
539 : static void
540 33217 : tcp_enqueue_to_output (tcp_worker_ctx_t * wrk, vlib_buffer_t * b, u32 bi,
541 : u8 is_ip4)
542 : {
543 33217 : b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
544 33217 : b->error = 0;
545 :
546 33217 : session_add_pending_tx_buffer (wrk->vm->thread_index, bi,
547 33217 : wrk->tco_next_node[!is_ip4]);
548 33217 : }
549 :
/** Rewrite the received buffer in place into a RST segment per RFC 793:
 *  swaps ports/addresses, picks seq/ack based on whether the offending
 *  segment carried an ACK, and frees any chained buffers. Returns 0. */
int
tcp_buffer_make_reset (vlib_main_t *vm, vlib_buffer_t *b, u8 is_ip4)
{
  ip4_address_t src_ip4 = {}, dst_ip4 = {};
  ip6_address_t src_ip6, dst_ip6;
  u16 src_port, dst_port;
  u32 tmp, len, seq, ack;
  ip4_header_t *ih4;
  ip6_header_t *ih6;
  tcp_header_t *th;
  u8 flags;

  /*
   * Find IP and TCP headers and glean information from them. Assumes
   * buffer was parsed by something like @ref tcp_input_lookup_buffer
   */
  th = tcp_buffer_hdr (b);

  if (is_ip4)
    {
      ih4 = vlib_buffer_get_current (b);
      ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40);
      src_ip4.as_u32 = ih4->src_address.as_u32;
      dst_ip4.as_u32 = ih4->dst_address.as_u32;
    }
  else
    {
      ih6 = vlib_buffer_get_current (b);
      ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60);
      clib_memcpy_fast (&src_ip6, &ih6->src_address, sizeof (ip6_address_t));
      clib_memcpy_fast (&dst_ip6, &ih6->dst_address, sizeof (ip6_address_t));
    }

  src_port = th->src_port;
  dst_port = th->dst_port;
  flags = TCP_FLAG_RST;

  /*
   * RFC 793. If the ACK bit is off, sequence number zero is used,
   * <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
   * If the ACK bit is on,
   * <SEQ=SEG.ACK><CTL=RST>
   */
  if (tcp_ack (th))
    {
      seq = th->ack_number;
      ack = 0;
    }
  else
    {
      flags |= TCP_FLAG_ACK;
      tmp = clib_net_to_host_u32 (th->seq_number);
      /* SYN and FIN each consume one sequence number */
      len = vnet_buffer (b)->tcp.data_len + tcp_is_syn (th) + tcp_is_fin (th);
      ack = clib_host_to_net_u32 (tmp + len);
      seq = 0;
    }

  /*
   * Clear and reuse current buffer for reset
   */
  if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
    vlib_buffer_free_one (vm, b->next_buffer);

  /* Zero all flags but free list index and trace flag */
  b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1;
  /* Make sure new tcp header comes after current ip */
  b->current_data = ((u8 *) th - b->data) + sizeof (tcp_header_t);
  b->current_length = 0;
  b->total_length_not_including_first_buffer = 0;
  vnet_buffer (b)->tcp.flags = 0;

  /*
   * Add TCP and IP headers
   */
  th = vlib_buffer_push_tcp_net_order (b, dst_port, src_port, seq, ack,
				       sizeof (tcp_header_t), flags, 0);

  if (is_ip4)
    {
      ih4 = vlib_buffer_push_ip4 (vm, b, &dst_ip4, &src_ip4,
				  IP_PROTOCOL_TCP, 1);
      th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4);
    }
  else
    {
      int bogus = ~0;
      ih6 = vlib_buffer_push_ip6 (vm, b, &dst_ip6, &src_ip6, IP_PROTOCOL_TCP);
      th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus);
      ASSERT (!bogus);
    }

  return 0;
}
643 :
644 : /**
645 : * Send reset without reusing existing buffer
646 : *
647 : * It extracts connection info out of original packet
648 : */
void
tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt,
		      u32 thread_index, u8 is_ip4)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b;
  u8 tcp_hdr_len, flags = 0;
  tcp_header_t *th, *pkt_th;
  u32 seq, ack, bi;
  ip4_header_t *ih4, *pkt_ih4;
  ip6_header_t *ih6, *pkt_ih6;

  /* Best effort: silently drop the reset if no buffer is available */
  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
    {
      tcp_worker_stats_inc (wrk, no_buffer, 1);
      return;
    }

  b = vlib_get_buffer (vm, bi);
  tcp_init_buffer (vm, b);
  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;

  /* Make and write options */
  tcp_hdr_len = sizeof (tcp_header_t);

  if (is_ip4)
    {
      pkt_ih4 = vlib_buffer_get_current (pkt);
      pkt_th = ip4_next_header (pkt_ih4);
    }
  else
    {
      pkt_ih6 = vlib_buffer_get_current (pkt);
      pkt_th = ip6_next_header (pkt_ih6);
    }

  /* RFC 793 style seq/ack selection based on the offending segment */
  if (tcp_ack (pkt_th))
    {
      flags = TCP_FLAG_RST;
      seq = pkt_th->ack_number;
      ack = (tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0;
      ack = clib_host_to_net_u32 (ack);
    }
  else
    {
      flags = TCP_FLAG_RST | TCP_FLAG_ACK;
      seq = 0;
      ack = clib_host_to_net_u32 (vnet_buffer (pkt)->tcp.seq_end);
    }

  th = vlib_buffer_push_tcp_net_order (b, pkt_th->dst_port, pkt_th->src_port,
				       seq, ack, tcp_hdr_len, flags, 0);

  /* Swap src and dst ip */
  if (is_ip4)
    {
      ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40);
      ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address,
				  &pkt_ih4->src_address, IP_PROTOCOL_TCP,
				  tcp_csum_offload (tc));
      th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4);
    }
  else
    {
      int bogus = ~0;
      ASSERT ((pkt_ih6->ip_version_traffic_class_and_flow_label & 0xF0) ==
	      0x60);
      ih6 = vlib_buffer_push_ip6_custom (vm, b, &pkt_ih6->dst_address,
					 &pkt_ih6->src_address,
					 IP_PROTOCOL_TCP,
					 tc->ipv6_flow_label);
      th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus);
      ASSERT (!bogus);
    }

  tcp_enqueue_half_open (wrk, tc, b, bi);
  TCP_EVT (TCP_EVT_RST_SENT, tc);
  vlib_node_increment_counter (vm, tcp_node_index (output, tc->c_is_ip4),
			       TCP_ERROR_RST_SENT, 1);
}
730 :
731 : /**
732 : * Build and set reset packet for connection
733 : */
void
tcp_send_reset (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b;
  u32 bi;
  tcp_header_t *th;
  u16 tcp_hdr_opts_len, advertise_wnd, opts_write_len;
  u8 flags;

  /* Best effort: drop the reset if no buffer is available */
  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
    {
      tcp_worker_stats_inc (wrk, no_buffer, 1);
      return;
    }
  b = vlib_get_buffer (vm, bi);
  tcp_init_buffer (vm, b);

  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
  tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
  advertise_wnd = tc->rcv_wnd >> tc->rcv_wscale;
  flags = TCP_FLAG_RST | TCP_FLAG_ACK;
  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
			     tc->rcv_nxt, tcp_hdr_opts_len, flags,
			     advertise_wnd);
  opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
  th->checksum = tcp_compute_checksum (tc, b);
  ASSERT (opts_write_len == tc->snd_opts_len);
  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
  TCP_EVT (TCP_EVT_RST_SENT, tc);
  vlib_node_increment_counter (vm, tcp_node_index (output, tc->c_is_ip4),
			       TCP_ERROR_RST_SENT, 1);
}
769 :
770 : /**
771 : * Send SYN
772 : *
773 : * Builds a SYN packet for a half-open connection and sends it to tcp-output.
774 : * The packet is handled by main thread and because half-open and established
775 : * connections use the same pool the connection can be retrieved without
776 : * additional logic.
777 : */
void
tcp_send_syn (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b;
  u32 bi;

  /*
   * Setup retransmit and establish timers before requesting buffer
   * such that we can return if we've ran out.
   */
  tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN,
		    (u32) tc->rto * TCP_TO_TIMER_TICK);

  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
    {
      /* No buffer: retry sooner than a full rto via the alloc error timeout */
      tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN,
			tcp_cfg.alloc_err_timeout);
      tcp_worker_stats_inc (wrk, no_buffer, 1);
      return;
    }

  b = vlib_get_buffer (vm, bi);
  tcp_init_buffer (vm, b);
  tcp_make_syn (tc, b);

  /* Measure RTT with this */
  tc->rtt_ts = tcp_time_now_us (vlib_num_workers ()? 1 : 0);
  tc->rtt_seq = tc->snd_nxt;
  tc->rto_boff = 0;

  tcp_enqueue_half_open (wrk, tc, b, bi);
  TCP_EVT (TCP_EVT_SYN_SENT, tc);
}
813 :
/** Build and enqueue a SYN-ACK; (re)arms the retransmit timer and starts
 *  an rtt sample. */
void
tcp_send_synack (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b;
  u32 bi;

  /* SYN must already be counted as in flight */
  ASSERT (tc->snd_una != tc->snd_nxt);
  tcp_retransmit_timer_update (&wrk->timer_wheel, tc);

  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
    {
      /* No buffer: retry quickly via the alloc error timeout */
      tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT,
			tcp_cfg.alloc_err_timeout);
      tcp_worker_stats_inc (wrk, no_buffer, 1);
      return;
    }

  tc->rtt_ts = tcp_time_now_us (tc->c_thread_index);
  b = vlib_get_buffer (vm, bi);
  tcp_init_buffer (vm, b);
  tcp_make_synack (tc, b);
  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
  TCP_EVT (TCP_EVT_SYNACK_SENT, tc);
}
840 :
841 : /**
842 : * Send FIN
843 : */
/**
 * Send FIN
 */
void
tcp_send_fin (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b;
  u32 bi;
  u8 fin_snt = 0;

  /* If a FIN was already sent (retransmit), temporarily rewind snd_nxt
   * so the new FIN carries the same sequence number */
  fin_snt = tc->flags & TCP_CONN_FINSNT;
  if (fin_snt)
    tc->snd_nxt -= 1;

  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
    {
      /* Out of buffers so program fin retransmit ASAP */
      tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT,
			tcp_cfg.alloc_err_timeout);
      if (fin_snt)
	tc->snd_nxt += 1;
      else
	/* Make sure retransmit retries a fin not data */
	tc->flags |= TCP_CONN_FINSNT;
      tcp_worker_stats_inc (wrk, no_buffer, 1);
      return;
    }

  /* If we have non-dupacks programmed, no need to send them */
  if ((tc->flags & TCP_CONN_SNDACK) && !tc->pending_dupacks)
    tc->flags &= ~TCP_CONN_SNDACK;

  b = vlib_get_buffer (vm, bi);
  tcp_init_buffer (vm, b);
  tcp_make_fin (tc, b);
  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
  TCP_EVT (TCP_EVT_FIN_SENT, tc);
  /* Account for the FIN */
  tc->snd_nxt += 1;
  tcp_retransmit_timer_update (&wrk->timer_wheel, tc);
  if (!fin_snt)
    {
      tc->flags |= TCP_CONN_FINSNT;
      tc->flags &= ~TCP_CONN_FINPNDG;
    }
}
889 :
890 : /**
891 : * Push TCP header and update connection variables. Should only be called
892 : * for segments with data, not for 'control' packets.
893 : */
always_inline void
tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, u32 snd_nxt,
		u8 compute_opts, u8 maybe_burst, u8 update_snd_nxt)
{
  u8 tcp_hdr_opts_len, flags = TCP_FLAG_ACK;
  u32 advertise_wnd, data_len;
  tcp_main_t *tm = &tcp_main;
  tcp_header_t *th;

  data_len = b->current_length;
  if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
    data_len += b->total_length_not_including_first_buffer;

  vnet_buffer (b)->tcp.flags = 0;
  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;

  if (compute_opts)
    tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);

  tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);

  /* In a burst, rcv_wnd was refreshed once in tcp_update_burst_snd_vars;
   * otherwise recompute the advertised window per segment */
  if (maybe_burst)
    advertise_wnd = tc->rcv_wnd >> tc->rcv_wscale;
  else
    advertise_wnd = tcp_window_to_advertise (tc, TCP_STATE_ESTABLISHED);

  /* Set PSH if the segment covers the byte marked for push */
  if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
    {
      if (seq_geq (tc->psh_seq, snd_nxt)
	  && seq_lt (tc->psh_seq, snd_nxt + data_len))
	flags |= TCP_FLAG_PSH;
    }
  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, snd_nxt,
			     tc->rcv_nxt, tcp_hdr_opts_len, flags,
			     advertise_wnd);

  if (maybe_burst)
    {
      /* Burst path reuses the pre-serialized per-worker option bytes */
      clib_memcpy_fast ((u8 *) (th + 1),
			tm->wrk_ctx[tc->c_thread_index].cached_opts,
			tc->snd_opts_len);
    }
  else
    {
      u8 len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
      ASSERT (len == tc->snd_opts_len);
    }

  /*
   * Update connection variables
   */

  if (update_snd_nxt)
    tc->snd_nxt += data_len;
  tc->rcv_las = tc->rcv_nxt;

  tc->bytes_out += data_len;
  tc->data_segs_out += 1;

  th->checksum = tcp_compute_checksum (tc, b);

  TCP_EVT (TCP_EVT_PKTIZE, tc);
}
957 :
958 : always_inline u32
959 0 : tcp_buffer_len (vlib_buffer_t * b)
960 : {
961 0 : u32 data_len = b->current_length;
962 0 : if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
963 0 : data_len += b->total_length_not_including_first_buffer;
964 0 : return data_len;
965 : }
966 :
/** Push tcp header for one burst segment, tracking tx for rate sampling
 *  when enabled. Always returns 0. */
always_inline u32
tcp_push_one_header (tcp_connection_t *tc, vlib_buffer_t *b)
{
  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
    tcp_bt_track_tx (tc, tcp_buffer_len (b));

  tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1,
		  /* update_snd_nxt */ 1);

  tcp_validate_txf_size (tc, tc->snd_nxt - tc->snd_una);
  return 0;
}
979 :
/**
 * Session-layer tx callback: push TCP headers on a batch of buffers.
 *
 * @param tconn	 transport connection (downcast to tcp_connection_t)
 * @param bs	 vector of buffers with payload already written
 * @param n_bufs number of buffers in @a bs
 * @return always 0
 */
u32
tcp_session_push_header (transport_connection_t *tconn, vlib_buffer_t **bs,
			 u32 n_bufs)
{
  tcp_connection_t *tc = (tcp_connection_t *) tconn;

  /* Process two buffers per iteration while prefetching the next two */
  while (n_bufs >= 4)
    {
      vlib_prefetch_buffer_header (bs[2], STORE);
      vlib_prefetch_buffer_header (bs[3], STORE);

      tcp_push_one_header (tc, bs[0]);
      tcp_push_one_header (tc, bs[1]);

      n_bufs -= 2;
      bs += 2;
    }
  /* Remainder, one buffer at a time */
  while (n_bufs)
    {
      if (n_bufs > 1)
	vlib_prefetch_buffer_header (bs[1], STORE);

      tcp_push_one_header (tc, bs[0]);

      n_bufs -= 1;
      bs += 1;
    }

  /* If not tracking an ACK, start tracking */
  if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))
    {
      tc->rtt_ts = tcp_time_now_us (tc->c_thread_index);
      tc->rtt_seq = tc->snd_nxt;
    }
  /* New data in flight: make sure the retransmit timer is armed and reset
   * the backoff */
  if (PREDICT_FALSE (!tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)))
    {
      tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
      tcp_retransmit_timer_set (&wrk->timer_wheel, tc);
      tc->rto_boff = 0;
    }
  return 0;
}
1022 :
/**
 * Allocate a buffer, build an ACK for @a tc and enqueue it to output.
 *
 * On buffer allocation failure the ACK is dropped: only the receive window
 * is refreshed and the worker's no_buffer counter is bumped.
 */
void
tcp_send_ack (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b;
  u32 bi;

  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
    {
      tcp_update_rcv_wnd (tc);
      tcp_worker_stats_inc (wrk, no_buffer, 1);
      return;
    }
  b = vlib_get_buffer (vm, bi);
  tcp_init_buffer (vm, b);
  tcp_make_ack (tc, b);
  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
}
1042 :
1043 : void
1044 1029260 : tcp_program_ack (tcp_connection_t * tc)
1045 : {
1046 1029260 : if (!(tc->flags & TCP_CONN_SNDACK))
1047 : {
1048 41829 : session_add_self_custom_tx_evt (&tc->connection, 1);
1049 41829 : tc->flags |= TCP_CONN_SNDACK;
1050 : }
1051 1029260 : }
1052 :
1053 : void
1054 0 : tcp_program_dupack (tcp_connection_t * tc)
1055 : {
1056 0 : if (!(tc->flags & TCP_CONN_SNDACK))
1057 : {
1058 0 : session_add_self_custom_tx_evt (&tc->connection, 1);
1059 0 : tc->flags |= TCP_CONN_SNDACK;
1060 : }
1061 0 : if (tc->pending_dupacks < 255)
1062 0 : tc->pending_dupacks += 1;
1063 0 : }
1064 :
1065 : void
1066 0 : tcp_program_retransmit (tcp_connection_t * tc)
1067 : {
1068 0 : if (!(tc->flags & TCP_CONN_RXT_PENDING))
1069 : {
1070 0 : session_add_self_custom_tx_evt (&tc->connection, 0);
1071 0 : tc->flags |= TCP_CONN_RXT_PENDING;
1072 : }
1073 0 : }
1074 :
1075 : /**
1076 : * Send window update ack
1077 : *
1078 : * Ensures that it will be sent only once, after a zero rwnd has been
1079 : * advertised in a previous ack, and only if rwnd has grown beyond a
1080 : * configurable value.
1081 : */
1082 : void
1083 0 : tcp_send_window_update_ack (tcp_connection_t * tc)
1084 : {
1085 0 : if (tcp_zero_rwnd_sent (tc))
1086 : {
1087 0 : tcp_update_rcv_wnd (tc);
1088 0 : if (tc->rcv_wnd >= tcp_cfg.rwnd_min_update_ack * tc->snd_mss)
1089 : {
1090 0 : tcp_zero_rwnd_sent_off (tc);
1091 0 : tcp_program_ack (tc);
1092 : }
1093 : }
1094 0 : }
1095 :
1096 : /**
1097 : * Allocate a new buffer and build a new tcp segment
1098 : *
1099 : * @param wrk tcp worker
1100 : * @param tc connection for which the segment will be allocated
1101 : * @param offset offset of the first byte in the tx fifo
1102 : * @param max_deq_byte segment size
1103 : * @param[out] b pointer to buffer allocated
1104 : *
1105 : * @return the number of bytes in the segment or 0 if buffer cannot be
1106 : * allocated or no data available
1107 : */
static int
tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
		     u32 offset, u32 max_deq_bytes, vlib_buffer_t ** b)
{
  u32 bytes_per_buffer = vnet_get_tcp_main ()->bytes_per_buffer;
  vlib_main_t *vm = wrk->vm;
  u32 bi, seg_size;
  int n_bytes = 0;
  u8 *data;

  /* Reserve room for the largest possible transport headers */
  seg_size = max_deq_bytes + TRANSPORT_MAX_HDRS_LEN;

  /*
   * Prepare options
   */
  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);

  /*
   * Allocate and fill in buffer(s)
   */

  /* Easy case, buffer size greater than mss */
  if (PREDICT_TRUE (seg_size <= bytes_per_buffer))
    {
      if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
	{
	  tcp_worker_stats_inc (wrk, no_buffer, 1);
	  return 0;
	}
      *b = vlib_get_buffer (vm, bi);
      data = tcp_init_buffer (vm, *b);
      /* Peek (do not dequeue) the payload; data stays in the fifo until
       * acked */
      n_bytes = session_tx_fifo_peek_bytes (&tc->connection, data, offset,
					    max_deq_bytes);
      ASSERT (n_bytes == max_deq_bytes);
      b[0]->current_length = n_bytes;
      /* snd_nxt is deliberately not advanced: caller decides */
      tcp_push_hdr_i (tc, *b, tc->snd_una + offset, /* compute opts */ 0,
		      /* burst */ 0, /* update_snd_nxt */ 0);
    }
  /* Split mss into multiple buffers */
  else
    {
      u32 chain_bi = ~0, n_bufs_per_seg, n_bufs;
      u16 n_peeked, len_to_deq;
      vlib_buffer_t *chain_b, *prev_b;
      int i;

      /* Make sure we have enough buffers */
      n_bufs_per_seg = ceil ((double) seg_size / bytes_per_buffer);
      vec_validate_aligned (wrk->tx_buffers, n_bufs_per_seg - 1,
			    CLIB_CACHE_LINE_BYTES);
      n_bufs = vlib_buffer_alloc (vm, wrk->tx_buffers, n_bufs_per_seg);
      if (PREDICT_FALSE (n_bufs != n_bufs_per_seg))
	{
	  /* Partial allocation: free what we got and bail out */
	  if (n_bufs)
	    vlib_buffer_free (vm, wrk->tx_buffers, n_bufs);
	  tcp_worker_stats_inc (wrk, no_buffer, 1);
	  return 0;
	}

      /* Head buffer: leave header room, fill the rest with payload */
      *b = vlib_get_buffer (vm, wrk->tx_buffers[--n_bufs]);
      data = tcp_init_buffer (vm, *b);
      n_bytes = session_tx_fifo_peek_bytes (&tc->connection, data, offset,
					    bytes_per_buffer -
					    TRANSPORT_MAX_HDRS_LEN);
      b[0]->current_length = n_bytes;
      b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
      b[0]->total_length_not_including_first_buffer = 0;
      max_deq_bytes -= n_bytes;

      /* Link the remaining buffers into a chain behind the head */
      chain_b = *b;
      for (i = 1; i < n_bufs_per_seg; i++)
	{
	  prev_b = chain_b;
	  len_to_deq = clib_min (max_deq_bytes, bytes_per_buffer);
	  chain_bi = wrk->tx_buffers[--n_bufs];
	  chain_b = vlib_get_buffer (vm, chain_bi);
	  chain_b->current_data = 0;
	  data = vlib_buffer_get_current (chain_b);
	  n_peeked = session_tx_fifo_peek_bytes (&tc->connection, data,
						 offset + n_bytes,
						 len_to_deq);
	  ASSERT (n_peeked == len_to_deq);
	  n_bytes += n_peeked;
	  chain_b->current_length = n_peeked;
	  chain_b->next_buffer = 0;

	  /* update previous buffer */
	  prev_b->next_buffer = chain_bi;
	  prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;

	  max_deq_bytes -= n_peeked;
	  b[0]->total_length_not_including_first_buffer += n_peeked;
	}

      tcp_push_hdr_i (tc, *b, tc->snd_una + offset, /* compute opts */ 0,
		      /* burst */ 0, /* update_snd_nxt */ 0);

      /* Should not happen given n_bufs_per_seg sizing above */
      if (PREDICT_FALSE (n_bufs))
	{
	  clib_warning ("not all buffers consumed");
	  vlib_buffer_free (vm, wrk->tx_buffers, n_bufs);
	}
    }

  ASSERT (n_bytes > 0);
  ASSERT (((*b)->current_data + (*b)->current_length) <= bytes_per_buffer);

  return n_bytes;
}
1217 :
1218 : /**
1219 : * Build a retransmit segment
1220 : *
1221 : * @return the number of bytes in the segment or 0 if there's nothing to
1222 : * retransmit
1223 : */
static u32
tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk,
				tcp_connection_t * tc, u32 offset,
				u32 max_deq_bytes, vlib_buffer_t ** b)
{
  u32 start, available_bytes;
  int n_bytes = 0;

  ASSERT (tc->state >= TCP_STATE_ESTABLISHED);
  ASSERT (max_deq_bytes != 0);

  /*
   * Make sure we can retransmit something
   */
  available_bytes = transport_max_tx_dequeue (&tc->connection);
  ASSERT (available_bytes >= offset);
  available_bytes -= offset;
  if (!available_bytes)
    return 0;

  /* Cap the segment at one mss and at what the fifo actually holds */
  max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes);
  max_deq_bytes = clib_min (available_bytes, max_deq_bytes);

  /* Retransmitted range must lie within [snd_una, snd_nxt] */
  start = tc->snd_una + offset;
  ASSERT (seq_leq (start + max_deq_bytes, tc->snd_nxt));

  n_bytes = tcp_prepare_segment (wrk, tc, offset, max_deq_bytes, b);
  if (!n_bytes)
    return 0;

  /* Book-keeping: retransmitted byte/segment accounting and rate sampling */
  tc->snd_rxt_bytes += n_bytes;

  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
    tcp_bt_track_rxt (tc, start, start + n_bytes);

  tc->bytes_retrans += n_bytes;
  tc->segs_retrans += 1;
  tcp_worker_stats_inc (wrk, rxt_segs, 1);
  TCP_EVT (TCP_EVT_CC_RTX, tc, offset, n_bytes);

  return n_bytes;
}
1266 :
1267 : static void
1268 0 : tcp_check_sack_reneging (tcp_connection_t * tc)
1269 : {
1270 0 : sack_scoreboard_t *sb = &tc->sack_sb;
1271 : sack_scoreboard_hole_t *hole;
1272 :
1273 0 : hole = scoreboard_first_hole (sb);
1274 0 : if (!sb->is_reneging && (!hole || hole->start == tc->snd_una))
1275 0 : return;
1276 :
1277 0 : scoreboard_clear_reneging (sb, tc->snd_una, tc->snd_nxt);
1278 : }
1279 :
1280 : /**
1281 : * Reset congestion control, switch cwnd to loss window and try again.
1282 : */
static void
tcp_cc_init_rxt_timeout (tcp_connection_t * tc)
{
  TCP_EVT (TCP_EVT_CC_EVT, tc, 6);

  /* Save current cwnd/ssthresh so they can be restored if the timeout
   * later proves spurious */
  tc->prev_ssthresh = tc->ssthresh;
  tc->prev_cwnd = tc->cwnd;

  /* If we entrered loss without fast recovery, notify cc algo of the
   * congestion event such that it can update ssthresh and its state */
  if (!tcp_in_fastrecovery (tc))
    tcp_cc_congestion (tc);

  /* Let cc algo decide loss cwnd and ssthresh post unrecovered loss */
  tcp_cc_loss (tc);

  /* Invalidate any in-progress RTT sample and reset accounting */
  tc->rtt_ts = 0;
  tc->cwnd_acc_bytes = 0;
  tc->tr_occurences += 1;
  tc->sack_sb.reorder = TCP_DUPACK_THRESHOLD;
  tcp_recovery_on (tc);
}
1305 :
/**
 * Retransmit timeout (RTO) handler.
 *
 * Handles established-state data/FIN retransmits and SYN-ACK retransmits
 * for passive opens. SYN retransmits for active opens are handled by
 * tcp_timer_retransmit_syn_handler.
 */
void
tcp_timer_retransmit_handler (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b = 0;
  u32 bi, n_bytes;

  tcp_worker_stats_inc (wrk, tr_events, 1);

  /* Should be handled by a different handler */
  if (PREDICT_FALSE (tc->state == TCP_STATE_SYN_SENT))
    return;

  /* Wait-close and retransmit could pop at the same time */
  if (tc->state == TCP_STATE_CLOSED)
    return;

  if (tc->state >= TCP_STATE_ESTABLISHED)
    {
      TCP_EVT (TCP_EVT_CC_EVT, tc, 2);

      /* Lost FIN, retransmit and return */
      if (tc->flags & TCP_CONN_FINSNT)
	{
	  tcp_send_fin (tc);
	  tc->rto_boff += 1;
	  /* Exponential backoff, capped at TCP_RTO_MAX */
	  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
	  return;
	}

      /* Shouldn't be here */
      if (tc->snd_una == tc->snd_nxt)
	{
	  ASSERT (!tcp_in_recovery (tc));
	  tc->rto_boff = 0;
	  return;
	}

      /* We're not in recovery so make sure rto_boff is 0. Can be non 0 due
       * to persist timer timeout */
      if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
	{
	  tc->rto_boff = 0;
	  tcp_update_rto (tc);
	}

      /* Peer is dead or network connectivity is lost. Close connection.
       * RFC 1122 section 4.2.3.5 recommends a value of at least 100s. For
       * a min rto of 0.2s we need to retry about 8 times. */
      if (tc->rto_boff >= TCP_RTO_BOFF_MAX)
	{
	  tcp_send_reset (tc);
	  tcp_connection_set_state (tc, TCP_STATE_CLOSED);
	  session_transport_closing_notify (&tc->connection);
	  session_transport_closed_notify (&tc->connection);
	  tcp_connection_timers_reset (tc);
	  tcp_program_cleanup (wrk, tc);
	  tcp_worker_stats_inc (wrk, tr_abort, 1);
	  return;
	}

      /* An RTO with SACK on may indicate the peer reneged on its SACKs */
      if (tcp_opts_sack_permitted (&tc->rcv_opts))
	{
	  tcp_check_sack_reneging (tc);
	  scoreboard_rxt_mark_lost (&tc->sack_sb, tc->snd_una, tc->snd_nxt);
	}

      /* Update send congestion to make sure that rxt has data to send */
      tc->snd_congestion = tc->snd_nxt;

      /* Send the first unacked segment. If we're short on buffers, return
       * as soon as possible */
      n_bytes = clib_min (tc->snd_mss, tc->snd_nxt - tc->snd_una);
      n_bytes = tcp_prepare_retransmit_segment (wrk, tc, 0, n_bytes, &b);
      if (!n_bytes)
	{
	  /* Retry shortly on allocation failure */
	  tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT,
			    tcp_cfg.alloc_err_timeout);
	  return;
	}

      bi = vlib_get_buffer_index (vm, b);
      tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);

      tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
      tcp_retransmit_timer_update (&wrk->timer_wheel, tc);

      tc->rto_boff += 1;
      if (tc->rto_boff == 1)
	{
	  /* First timeout: switch congestion control to loss state */
	  tcp_cc_init_rxt_timeout (tc);
	  /* Record timestamp. Eifel detection algorithm RFC3522 */
	  tc->snd_rxt_ts = tcp_tstamp (tc);
	}

      if (tcp_opts_sack_permitted (&tc->rcv_opts))
	scoreboard_init_rxt (&tc->sack_sb, tc->snd_una + n_bytes);

      /* Schedule retransmit of the rest of the outstanding data */
      tcp_program_retransmit (tc);
    }
  /* Retransmit SYN-ACK */
  else if (tc->state == TCP_STATE_SYN_RCVD)
    {
      TCP_EVT (TCP_EVT_CC_EVT, tc, 2);

      tc->rtt_ts = 0;

      /* Passive open establish timeout */
      if (tc->rto > TCP_ESTABLISH_TIME >> 1)
	{
	  tcp_connection_set_state (tc, TCP_STATE_CLOSED);
	  tcp_connection_timers_reset (tc);
	  tcp_program_cleanup (wrk, tc);
	  tcp_worker_stats_inc (wrk, tr_abort, 1);
	  return;
	}

      if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
	{
	  tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT,
			    tcp_cfg.alloc_err_timeout);
	  tcp_worker_stats_inc (wrk, no_buffer, 1);
	  return;
	}

      /* Keep rto flat for the first TCP_RTO_SYN_RETRIES attempts, then
       * back off exponentially */
      tc->rto_boff += 1;
      if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
	tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);

      ASSERT (tc->snd_una != tc->snd_nxt);
      tcp_retransmit_timer_update (&wrk->timer_wheel, tc);

      b = vlib_get_buffer (vm, bi);
      tcp_init_buffer (vm, b);
      tcp_make_synack (tc, b);
      TCP_EVT (TCP_EVT_SYN_RXT, tc, 1);

      /* Retransmit timer already updated, just enqueue to output */
      tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
    }
  else
    {
      ASSERT (tc->state == TCP_STATE_CLOSED);
      return;
    }
}
1453 :
1454 : /**
1455 : * SYN retransmit timer handler. Active open only.
1456 : */
void
tcp_timer_retransmit_syn_handler (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b = 0;
  u32 bi;

  /* Note: the connection may have transitioned to ESTABLISHED... */
  if (PREDICT_FALSE (tc->state != TCP_STATE_SYN_SENT))
    return;

  /* Half-open connection actually moved to established but we were
   * waiting for syn retransmit to pop to call cleanup from the right
   * thread. */
  if (tc->flags & TCP_CONN_HALF_OPEN_DONE)
    {
      if (tcp_half_open_connection_cleanup (tc))
	TCP_DBG ("could not remove half-open connection");
      return;
    }

  TCP_EVT (TCP_EVT_CC_EVT, tc, 2);
  /* Discard any in-progress RTT sample */
  tc->rtt_ts = 0;

  /* Active open establish timeout */
  if (tc->rto >= TCP_ESTABLISH_TIME >> 1)
    {
      session_stream_connect_notify (&tc->connection, SESSION_E_TIMEDOUT);
      tcp_connection_cleanup (tc);
      return;
    }

  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
    {
      /* Retry shortly on allocation failure */
      tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN,
			tcp_cfg.alloc_err_timeout);
      tcp_worker_stats_inc (wrk, no_buffer, 1);
      return;
    }

  /* Try without increasing RTO a number of times. If this fails,
   * start growing RTO exponentially */
  tc->rto_boff += 1;
  if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
    tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);

  b = vlib_get_buffer (vm, bi);
  tcp_init_buffer (vm, b);
  tcp_make_syn (tc, b);

  TCP_EVT (TCP_EVT_SYN_RXT, tc, 0);

  /* Half-open connections go through a dedicated output path */
  tcp_enqueue_half_open (wrk, tc, b, bi);

  tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN,
		    (u32) tc->rto * TCP_TO_TIMER_TICK);
}
1515 :
1516 : /**
1517 : * Got 0 snd_wnd from peer, try to do something about it.
1518 : *
1519 : */
void
tcp_timer_persist_handler (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
  u32 bi, max_snd_bytes, available_bytes, offset;
  tcp_main_t *tm = vnet_get_tcp_main ();
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b;
  int n_bytes = 0;
  u8 *data;

  /* Problem already solved or worse */
  if (tc->state == TCP_STATE_CLOSED || tc->snd_wnd > tc->snd_mss
      || (tc->flags & TCP_CONN_FINSNT))
    goto update_scheduler;

  available_bytes = transport_max_tx_dequeue (&tc->connection);
  /* Bytes already sent but not yet acked */
  offset = tc->snd_nxt - tc->snd_una;

  /* Reprogram persist if no new bytes available to send. We may have data
   * next time */
  if (!available_bytes)
    {
      tcp_persist_timer_set (&wrk->timer_wheel, tc);
      return;
    }

  /* Everything in the fifo is already in flight; nothing to probe with */
  if (available_bytes <= offset)
    goto update_scheduler;

  /* Increment RTO backoff */
  tc->rto_boff += 1;
  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);

  /*
   * Try to force the first unsent segment (or buffer)
   */
  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
    {
      tcp_persist_timer_set (&wrk->timer_wheel, tc);
      tcp_worker_stats_inc (wrk, no_buffer, 1);
      return;
    }

  b = vlib_get_buffer (vm, bi);
  data = tcp_init_buffer (vm, b);

  tcp_validate_txf_size (tc, offset);
  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
  /* Probe size: bounded by mss, fifo contents, buffer space and, if
   * non-zero, the peer's advertised window */
  max_snd_bytes = clib_min (clib_min (tc->snd_mss, available_bytes),
			    tm->bytes_per_buffer - TRANSPORT_MAX_HDRS_LEN);
  if (tc->snd_wnd > 0)
    max_snd_bytes = clib_min (tc->snd_wnd, max_snd_bytes);
  n_bytes = session_tx_fifo_peek_bytes (&tc->connection, data, offset,
					max_snd_bytes);
  b->current_length = n_bytes;
  ASSERT (n_bytes != 0 && (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)
			   || tc->snd_una == tc->snd_nxt
			   || tc->rto_boff > 1));

  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
    {
      tcp_bt_check_app_limited (tc);
      tcp_bt_track_tx (tc, n_bytes);
    }

  tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0,
		  /* burst */ 0, /* update_snd_nxt */ 1);
  tcp_validate_txf_size (tc, tc->snd_nxt - tc->snd_una);
  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);

  /* Just sent new data, enable retransmit */
  tcp_retransmit_timer_update (&wrk->timer_wheel, tc);

  return;

update_scheduler:

  /* Persist condition gone: hand the connection back to the scheduler */
  if (tcp_is_descheduled (tc))
    transport_connection_reschedule (&tc->connection);
}
1601 :
1602 : /**
1603 : * Retransmit first unacked segment
1604 : */
1605 : int
1606 0 : tcp_retransmit_first_unacked (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
1607 : {
1608 0 : vlib_main_t *vm = wrk->vm;
1609 : vlib_buffer_t *b;
1610 : u32 bi, n_bytes;
1611 :
1612 : TCP_EVT (TCP_EVT_CC_EVT, tc, 1);
1613 :
1614 0 : n_bytes = tcp_prepare_retransmit_segment (wrk, tc, 0, tc->snd_mss, &b);
1615 0 : if (!n_bytes)
1616 0 : return -1;
1617 :
1618 0 : bi = vlib_get_buffer_index (vm, b);
1619 0 : tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1620 :
1621 0 : return 0;
1622 : }
1623 :
/**
 * Transmit previously unsent data, up to @a burst_size segments.
 *
 * Advances snd_nxt for each segment sent.
 *
 * @return number of segments transmitted
 */
static int
tcp_transmit_unsent (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
		     u32 burst_size)
{
  u32 offset, n_segs = 0, n_written, bi, available_wnd;
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b = 0;

  /* Limit burst to what fits in the peer's remaining window */
  offset = tc->snd_nxt - tc->snd_una;
  available_wnd = tc->snd_wnd - offset;
  burst_size = clib_min (burst_size, available_wnd / tc->snd_mss);

  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
    tcp_bt_check_app_limited (tc);

  while (n_segs < burst_size)
    {
      n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b);
      if (!n_written)
	goto done;

      bi = vlib_get_buffer_index (vm, b);
      tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
      offset += n_written;
      n_segs += 1;

      if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
	tcp_bt_track_tx (tc, n_written);

      /* tcp_prepare_segment does not move snd_nxt; do it here */
      tc->snd_nxt += n_written;
    }

done:
  return n_segs;
}
1659 :
1660 : /**
1661 : * Estimate send space using proportional rate reduction (RFC6937)
1662 : */
int
tcp_fastrecovery_prr_snd_space (tcp_connection_t * tc)
{
  u32 pipe, prr_out;
  int space;

  pipe = tcp_flight_size (tc);
  /* Bytes sent since recovery started: retransmits plus new data past the
   * recovery point */
  prr_out = tc->snd_rxt_bytes + (tc->snd_nxt - tc->snd_congestion);

  if (pipe > tc->ssthresh)
    {
      /* PRR proper: release delivered bytes scaled by ssthresh/prev_cwnd
       * (the reduction ratio), minus what was already sent */
      space = ((int) tc->prr_delivered * ((f64) tc->ssthresh / tc->prev_cwnd))
	- prr_out;
    }
  else
    {
      /* PRR slow-start reduction bound (PRR-SSRB): allow at most one extra
       * mss over delivered-minus-sent, capped so pipe grows toward ssthresh */
      int limit;
      limit = clib_max ((int) (tc->prr_delivered - prr_out), 0) + tc->snd_mss;
      space = clib_min (tc->ssthresh - pipe, limit);
    }
  /* Always permit one mss until something has been sent in recovery */
  space = clib_max (space, prr_out ? 0 : tc->snd_mss);
  return space;
}
1686 :
/**
 * Decide whether the segment at snd_una should be (re)retransmitted.
 *
 * Returns 1 on the first pass of fast recovery, or when sacked bytes have
 * advanced past the recovery point faster than snd_una, scaled by the
 * cwnd reduction ratio — a hint that the head retransmit itself was lost.
 */
static inline u8
tcp_retransmit_should_retry_head (tcp_connection_t * tc,
				  sack_scoreboard_t * sb)
{
  u32 tx_adv_sack = sb->high_sacked - tc->snd_congestion;
  f64 rr = (f64) tc->ssthresh / tc->prev_cwnd;

  if (tcp_fastrecovery_first (tc))
    return 1;

  return (tx_adv_sack > (tc->snd_una - tc->prr_start) * rr);
}
1699 :
1700 : static inline u8
1701 41826 : tcp_max_tx_deq (tcp_connection_t * tc)
1702 : {
1703 41826 : return (transport_max_tx_dequeue (&tc->connection)
1704 41826 : - (tc->snd_nxt - tc->snd_una));
1705 : }
1706 :
/* A rescue retransmit is still valid if it lies within the current
 * recovery window [snd_una, snd_congestion] */
#define scoreboard_rescue_rxt_valid(_sb, _tc)			\
  (seq_geq (_sb->rescue_rxt, _tc->snd_una) 			\
   && seq_leq (_sb->rescue_rxt, _tc->snd_congestion))

/**
 * Do retransmit with SACKs
 */
static int
tcp_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
		     u32 burst_size)
{
  u32 n_written = 0, offset, max_bytes, n_segs = 0;
  u8 snd_limited = 0, can_rescue = 0;
  u32 bi, max_deq, burst_bytes;
  sack_scoreboard_hole_t *hole;
  vlib_main_t *vm = wrk->vm;
  vlib_buffer_t *b = 0;
  sack_scoreboard_t *sb;
  int snd_space;

  ASSERT (tcp_in_cong_recovery (tc));

  /* Pacer limits how many segments this pass may emit; if none, retry
   * later */
  burst_bytes = transport_connection_tx_pacer_burst (&tc->connection);
  burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss);
  if (!burst_size)
    {
      tcp_program_retransmit (tc);
      return 0;
    }

  /* Timer-based recovery uses plain cc space; fast recovery uses PRR */
  if (tcp_in_recovery (tc))
    snd_space = tcp_available_cc_snd_space (tc);
  else
    snd_space = tcp_fastrecovery_prr_snd_space (tc);

  if (snd_space < tc->snd_mss)
    goto done;

  sb = &tc->sack_sb;

  /* Check if snd_una is a lost retransmit */
  if (pool_elts (sb->holes)
      && seq_gt (sb->high_sacked, tc->snd_congestion)
      && tc->rxt_head != tc->snd_una
      && tcp_retransmit_should_retry_head (tc, sb))
    {
      max_bytes = clib_min (tc->snd_mss, tc->snd_nxt - tc->snd_una);
      n_written = tcp_prepare_retransmit_segment (wrk, tc, 0, max_bytes, &b);
      if (!n_written)
	{
	  tcp_program_retransmit (tc);
	  goto done;
	}
      bi = vlib_get_buffer_index (vm, b);
      tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
      n_segs = 1;

      /* Remember we retried the head so we don't loop on it */
      tc->rxt_head = tc->snd_una;
      tc->rxt_delivered += n_written;
      tc->prr_delivered += n_written;
      ASSERT (tc->rxt_delivered <= tc->snd_rxt_bytes);
    }

  tcp_fastrecovery_first_off (tc);

  TCP_EVT (TCP_EVT_CC_EVT, tc, 0);
  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);

  /* Unsent bytes still sitting in the tx fifo */
  max_deq = transport_max_tx_dequeue (&tc->connection);
  max_deq -= tc->snd_nxt - tc->snd_una;

  while (snd_space > 0 && n_segs < burst_size)
    {
      hole = scoreboard_next_rxt_hole (sb, hole, max_deq != 0, &can_rescue,
				       &snd_limited);
      if (!hole)
	{
	  /* We are out of lost holes to retransmit so send some new data. */
	  if (max_deq > tc->snd_mss)
	    {
	      u32 n_segs_new;
	      int av_wnd;

	      /* Make sure we don't exceed available window and leave space
	       * for one more packet, to avoid zero window acks */
	      av_wnd = (int) tc->snd_wnd - (tc->snd_nxt - tc->snd_una);
	      av_wnd = clib_max (av_wnd - tc->snd_mss, 0);
	      snd_space = clib_min (snd_space, av_wnd);
	      snd_space = clib_min (max_deq, snd_space);
	      burst_size = clib_min (burst_size - n_segs,
				     snd_space / tc->snd_mss);
	      burst_size = clib_min (burst_size, TCP_RXT_MAX_BURST);
	      n_segs_new = tcp_transmit_unsent (wrk, tc, burst_size);
	      if (max_deq > n_segs_new * tc->snd_mss)
		tcp_program_retransmit (tc);

	      n_segs += n_segs_new;
	      goto done;
	    }

	  if (tcp_in_recovery (tc) || !can_rescue
	      || scoreboard_rescue_rxt_valid (sb, tc))
	    break;

	  /* If rescue rxt undefined or less than snd_una then one segment of
	   * up to SMSS octets that MUST include the highest outstanding
	   * unSACKed sequence number SHOULD be returned, and RescueRxt set to
	   * RecoveryPoint. HighRxt MUST NOT be updated.
	   */
	  hole = scoreboard_last_hole (sb);
	  max_bytes = clib_min (tc->snd_mss, hole->end - hole->start);
	  max_bytes = clib_min (max_bytes, snd_space);
	  offset = hole->end - tc->snd_una - max_bytes;
	  n_written = tcp_prepare_retransmit_segment (wrk, tc, offset,
						      max_bytes, &b);
	  if (!n_written)
	    goto done;

	  sb->rescue_rxt = tc->snd_congestion;
	  bi = vlib_get_buffer_index (vm, b);
	  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
	  n_segs += 1;
	  break;
	}

      /* Regular hole retransmit, starting at high_rxt */
      max_bytes = clib_min (hole->end - sb->high_rxt, snd_space);
      max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes;
      if (max_bytes == 0)
	break;

      offset = sb->high_rxt - tc->snd_una;
      n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, max_bytes,
						  &b);
      ASSERT (n_written <= snd_space);

      /* Nothing left to retransmit */
      if (n_written == 0)
	break;

      bi = vlib_get_buffer_index (vm, b);
      tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);

      sb->high_rxt += n_written;
      ASSERT (seq_leq (sb->high_rxt, tc->snd_nxt));

      snd_space -= n_written;
      n_segs += 1;
    }

  /* Burst/space exhausted before the holes were: continue next pass */
  if (hole)
    tcp_program_retransmit (tc);

done:

  transport_connection_tx_pacer_reset_bucket (&tc->connection, 0);
  return n_segs;
}
1864 :
1865 : /**
1866 : * Fast retransmit without SACK info
1867 : */
static int
tcp_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
			u32 burst_size)
{
  u32 n_written = 0, offset = 0, bi, max_deq, n_segs_now, max_bytes;
  u32 burst_bytes, sent_bytes;
  vlib_main_t *vm = wrk->vm;
  int snd_space, n_segs = 0;
  u8 cc_limited = 0;
  vlib_buffer_t *b;

  ASSERT (tcp_in_cong_recovery (tc));
  TCP_EVT (TCP_EVT_CC_EVT, tc, 0);

  /* Pacer limits how many segments this pass may emit; if none, retry
   * later */
  burst_bytes = transport_connection_tx_pacer_burst (&tc->connection);
  burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss);
  if (!burst_size)
    {
      tcp_program_retransmit (tc);
      return 0;
    }

  snd_space = tcp_available_cc_snd_space (tc);
  cc_limited = snd_space < burst_bytes;

  if (!tcp_fastrecovery_first (tc))
    goto send_unsent;

  /* RFC 6582: [If a partial ack], retransmit the first unacknowledged
   * segment. */
  while (snd_space > 0 && n_segs < burst_size)
    {
      max_bytes = clib_min (tc->snd_mss,
			    tc->snd_congestion - tc->snd_una - offset);
      if (!max_bytes)
	break;
      n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, max_bytes,
						  &b);

      /* Nothing left to retransmit */
      if (n_written == 0)
	break;

      bi = vlib_get_buffer_index (vm, b);
      tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
      snd_space -= n_written;
      offset += n_written;
      n_segs += 1;
    }

  if (n_segs == burst_size)
    goto done;

send_unsent:

  /* RFC 6582: Send a new segment if permitted by the new value of cwnd. */
  if (snd_space < tc->snd_mss || tc->snd_mss == 0)
    goto done;

  /* Unsent bytes still sitting in the tx fifo */
  max_deq = transport_max_tx_dequeue (&tc->connection);
  max_deq -= tc->snd_nxt - tc->snd_una;
  if (max_deq)
    {
      snd_space = clib_min (max_deq, snd_space);
      burst_size = clib_min (burst_size - n_segs, snd_space / tc->snd_mss);
      n_segs_now = tcp_transmit_unsent (wrk, tc, burst_size);
      /* More data remains: schedule another pass */
      if (n_segs_now && max_deq > n_segs_now * tc->snd_mss)
	tcp_program_retransmit (tc);
      n_segs += n_segs_now;
    }

done:
  tcp_fastrecovery_first_off (tc);

  /* Refill the pacer with what was actually consumed */
  sent_bytes = clib_min (n_segs * tc->snd_mss, burst_bytes);
  sent_bytes = cc_limited ? burst_bytes : sent_bytes;
  transport_connection_tx_pacer_update_bytes (&tc->connection, sent_bytes);

  return n_segs;
}
1948 :
/**
 * Send pending (dup)acks, at most @a max_burst_size of them.
 *
 * @return number of acks actually sent
 */
static int
tcp_send_acks (tcp_connection_t * tc, u32 max_burst_size)
{
  int j, n_acks;

  if (!tc->pending_dupacks)
    {
      /* Plain ack. Also sent when there's no more tx data pending or the
       * connection is not established, since delaying buys nothing then */
      if (tcp_in_cong_recovery (tc) || !tcp_max_tx_deq (tc)
	  || tc->state != TCP_STATE_ESTABLISHED)
	{
	  tcp_send_ack (tc);
	  return 1;
	}
      return 0;
    }

  /* If we're supposed to send dupacks but have no ooo data
   * send only one ack */
  if (!vec_len (tc->snd_sacks))
    {
      tcp_send_ack (tc);
      tc->dupacks_out += 1;
      tc->pending_dupacks = 0;
      return 1;
    }

  /* Start with first sack block */
  tc->snd_sack_pos = 0;

  /* Generate enough dupacks to cover all sack blocks. Do not generate
   * more sacks than the number of packets received. But do generate at
   * least 3, i.e., the number needed to signal congestion, if needed. */
  n_acks = vec_len (tc->snd_sacks) / TCP_OPTS_MAX_SACK_BLOCKS;
  n_acks = clib_min (n_acks, tc->pending_dupacks);
  n_acks = clib_max (n_acks, clib_min (tc->pending_dupacks, 3));
  for (j = 0; j < clib_min (n_acks, max_burst_size); j++)
    tcp_send_ack (tc);

  if (n_acks < max_burst_size)
    {
      tc->pending_dupacks = 0;
      tc->snd_sack_pos = 0;
      tc->dupacks_out += n_acks;
      return n_acks;
    }
  else
    {
      /* Burst exhausted: carry the remainder over to a new event */
      TCP_DBG ("constrained by burst size");
      tc->pending_dupacks = n_acks - max_burst_size;
      tc->dupacks_out += max_burst_size;
      tcp_program_dupack (tc);
      return max_burst_size;
    }
}
2003 :
2004 : static int
2005 0 : tcp_do_retransmit (tcp_connection_t * tc, u32 max_burst_size)
2006 : {
2007 : tcp_worker_ctx_t *wrk;
2008 : u32 n_segs;
2009 :
2010 0 : if (PREDICT_FALSE (tc->state == TCP_STATE_CLOSED))
2011 0 : return 0;
2012 :
2013 0 : wrk = tcp_get_worker (tc->c_thread_index);
2014 :
2015 0 : if (tcp_opts_sack_permitted (&tc->rcv_opts))
2016 0 : n_segs = tcp_retransmit_sack (wrk, tc, max_burst_size);
2017 : else
2018 0 : n_segs = tcp_retransmit_no_sack (wrk, tc, max_burst_size);
2019 :
2020 0 : return n_segs;
2021 : }
2022 :
2023 : int
2024 41829 : tcp_session_custom_tx (void *conn, transport_send_params_t * sp)
2025 : {
2026 41829 : tcp_connection_t *tc = (tcp_connection_t *) conn;
2027 41829 : u32 n_segs = 0;
2028 :
2029 41829 : if (tcp_in_cong_recovery (tc) && (tc->flags & TCP_CONN_RXT_PENDING))
2030 : {
2031 0 : tc->flags &= ~TCP_CONN_RXT_PENDING;
2032 0 : n_segs = tcp_do_retransmit (tc, sp->max_burst_size);
2033 : }
2034 :
2035 41829 : if (!(tc->flags & TCP_CONN_SNDACK))
2036 3 : return n_segs;
2037 :
2038 41826 : tc->flags &= ~TCP_CONN_SNDACK;
2039 :
2040 : /* We have retransmitted packets and no dupack */
2041 41826 : if (n_segs && !tc->pending_dupacks)
2042 0 : return n_segs;
2043 :
2044 41826 : if (sp->max_burst_size <= n_segs)
2045 : {
2046 0 : tcp_program_ack (tc);
2047 0 : return n_segs;
2048 : }
2049 :
2050 41826 : n_segs += tcp_send_acks (tc, sp->max_burst_size - n_segs);
2051 :
2052 41826 : return n_segs;
2053 : }
2054 : #endif /* CLIB_MARCH_VARIANT */
2055 :
2056 : static void
2057 0 : tcp_output_handle_link_local (tcp_connection_t * tc0, vlib_buffer_t * b0,
2058 : u16 * next0, u32 * error0)
2059 : {
2060 : ip_adjacency_t *adj;
2061 : adj_index_t ai;
2062 :
2063 : /* Not thread safe but as long as the connection exists the adj should
2064 : * not be removed */
2065 0 : ai = adj_nbr_find (FIB_PROTOCOL_IP6, VNET_LINK_IP6, &tc0->c_rmt_ip,
2066 : tc0->sw_if_index);
2067 0 : if (ai == ADJ_INDEX_INVALID)
2068 : {
2069 0 : vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
2070 0 : *next0 = TCP_OUTPUT_NEXT_DROP;
2071 0 : *error0 = TCP_ERROR_LINK_LOCAL_RW;
2072 0 : return;
2073 : }
2074 :
2075 0 : adj = adj_get (ai);
2076 0 : if (PREDICT_TRUE (adj->lookup_next_index == IP_LOOKUP_NEXT_REWRITE))
2077 0 : *next0 = TCP_OUTPUT_NEXT_IP_REWRITE;
2078 0 : else if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
2079 0 : *next0 = TCP_OUTPUT_NEXT_IP_ARP;
2080 : else
2081 : {
2082 0 : *next0 = TCP_OUTPUT_NEXT_DROP;
2083 0 : *error0 = TCP_ERROR_LINK_LOCAL_RW;
2084 : }
2085 0 : vnet_buffer (b0)->ip.adj_index[VLIB_TX] = ai;
2086 : }
2087 :
2088 : static void
2089 0 : tcp46_output_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node,
2090 : u32 * to_next, u32 n_bufs)
2091 : {
2092 : tcp_connection_t *tc;
2093 : tcp_tx_trace_t *t;
2094 : vlib_buffer_t *b;
2095 : tcp_header_t *th;
2096 : int i;
2097 :
2098 0 : for (i = 0; i < n_bufs; i++)
2099 : {
2100 0 : b = vlib_get_buffer (vm, to_next[i]);
2101 0 : if (!(b->flags & VLIB_BUFFER_IS_TRACED))
2102 0 : continue;
2103 0 : th = vlib_buffer_get_current (b);
2104 0 : tc = tcp_connection_get (vnet_buffer (b)->tcp.connection_index,
2105 : vm->thread_index);
2106 0 : t = vlib_add_trace (vm, node, b, sizeof (*t));
2107 0 : clib_memcpy_fast (&t->tcp_header, th, sizeof (t->tcp_header));
2108 0 : clib_memcpy_fast (&t->tcp_connection, tc, sizeof (t->tcp_connection));
2109 : }
2110 0 : }
2111 :
2112 : always_inline void
2113 1062440 : tcp_output_push_ip (vlib_main_t * vm, vlib_buffer_t * b0,
2114 : tcp_connection_t * tc0, u8 is_ip4)
2115 : {
2116 : TCP_EVT (TCP_EVT_OUTPUT, tc0,
2117 : ((tcp_header_t *) vlib_buffer_get_current (b0))->flags,
2118 : b0->current_length);
2119 :
2120 1062440 : if (is_ip4)
2121 1062420 : vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4,
2122 1062420 : IP_PROTOCOL_TCP, tcp_csum_offload (tc0));
2123 : else
2124 17 : vlib_buffer_push_ip6_custom (vm, b0, &tc0->c_lcl_ip6, &tc0->c_rmt_ip6,
2125 : IP_PROTOCOL_TCP, tc0->ipv6_flow_label);
2126 1062440 : }
2127 :
2128 : always_inline void
2129 1062440 : tcp_check_if_gso (tcp_connection_t * tc, vlib_buffer_t * b)
2130 : {
2131 1062440 : if (PREDICT_TRUE (!(tc->cfg_flags & TCP_CFG_F_TSO)))
2132 1062440 : return;
2133 :
2134 0 : u16 data_len = b->current_length - sizeof (tcp_header_t) - tc->snd_opts_len;
2135 :
2136 0 : if (PREDICT_FALSE (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID))
2137 0 : data_len += b->total_length_not_including_first_buffer;
2138 :
2139 0 : if (PREDICT_TRUE (data_len <= tc->snd_mss))
2140 0 : return;
2141 : else
2142 : {
2143 0 : ASSERT ((b->flags & VNET_BUFFER_F_L3_HDR_OFFSET_VALID) != 0);
2144 0 : ASSERT ((b->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID) != 0);
2145 0 : b->flags |= VNET_BUFFER_F_GSO;
2146 0 : vnet_buffer2 (b)->gso_l4_hdr_sz =
2147 0 : sizeof (tcp_header_t) + tc->snd_opts_len;
2148 0 : vnet_buffer2 (b)->gso_size = tc->snd_mss;
2149 : }
2150 : }
2151 :
2152 : always_inline void
2153 1062440 : tcp_output_handle_packet (tcp_connection_t * tc0, vlib_buffer_t * b0,
2154 : vlib_node_runtime_t * error_node, u16 * next0,
2155 : u8 is_ip4)
2156 : {
2157 : /* If next_index is not drop use it */
2158 1062440 : if (tc0->next_node_index)
2159 : {
2160 0 : *next0 = tc0->next_node_index;
2161 0 : vnet_buffer (b0)->tcp.next_node_opaque = tc0->next_node_opaque;
2162 : }
2163 : else
2164 : {
2165 1062440 : *next0 = TCP_OUTPUT_NEXT_IP_LOOKUP;
2166 : }
2167 :
2168 1062440 : vnet_buffer (b0)->sw_if_index[VLIB_TX] = tc0->c_fib_index;
2169 1062440 : vnet_buffer (b0)->sw_if_index[VLIB_RX] = tc0->sw_if_index;
2170 :
2171 1062440 : if (!is_ip4)
2172 : {
2173 17 : u32 error0 = 0;
2174 :
2175 17 : if (PREDICT_FALSE (ip6_address_is_link_local_unicast (&tc0->c_rmt_ip6)))
2176 0 : tcp_output_handle_link_local (tc0, b0, next0, &error0);
2177 :
2178 17 : if (PREDICT_FALSE (error0))
2179 : {
2180 0 : b0->error = error_node->errors[error0];
2181 0 : return;
2182 : }
2183 : }
2184 :
2185 1062440 : tc0->segs_out += 1;
2186 : }
2187 :
/**
 * Main tcp output node function: for every buffer in the frame, look up
 * its connection, push the ip header, flag GSO if needed and steer the
 * buffer to the proper next node. Buffers whose connection is gone are
 * dropped and counted as invalid.
 *
 * @param vm	vlib main
 * @param node	node runtime
 * @param frame	frame of buffer indices, tcp headers already pushed
 * @param is_ip4	1 for the ip4 variant of the node, 0 for ip6
 * @return number of vectors processed
 */
always_inline uword
tcp46_output_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
		     vlib_frame_t * frame, int is_ip4)
{
  u32 n_left_from, *from, thread_index = vm->thread_index;
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u16 nexts[VLIB_FRAME_SIZE], *next;
  u16 err_counters[TCP_N_ERROR] = { 0 };

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  /* Refresh the worker's notion of time before stamping segments */
  tcp_update_time_now (tcp_get_worker (thread_index));

  if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
    tcp46_output_trace_frame (vm, node, from, n_left_from);

  vlib_get_buffers (vm, from, bufs, n_left_from);
  b = bufs;
  next = nexts;

  /* Dual-loop: process two buffers per iteration while at least four
   * remain, prefetching the next pair's headers and data */
  while (n_left_from >= 4)
    {
      tcp_connection_t *tc0, *tc1;

      {
	vlib_prefetch_buffer_header (b[2], STORE);
	CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);

	vlib_prefetch_buffer_header (b[3], STORE);
	CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
      }

      tc0 = tcp_connection_get (vnet_buffer (b[0])->tcp.connection_index,
				thread_index);
      tc1 = tcp_connection_get (vnet_buffer (b[1])->tcp.connection_index,
				thread_index);

      /* Fast path: both connections still exist */
      if (PREDICT_TRUE (!tc0 + !tc1 == 0))
	{
	  tcp_output_push_ip (vm, b[0], tc0, is_ip4);
	  tcp_output_push_ip (vm, b[1], tc1, is_ip4);

	  tcp_check_if_gso (tc0, b[0]);
	  tcp_check_if_gso (tc1, b[1]);

	  tcp_output_handle_packet (tc0, b[0], node, &next[0], is_ip4);
	  tcp_output_handle_packet (tc1, b[1], node, &next[1], is_ip4);
	}
      else
	{
	  /* Slow path: at least one connection is gone; drop its buffer */
	  if (tc0 != 0)
	    {
	      tcp_output_push_ip (vm, b[0], tc0, is_ip4);
	      tcp_check_if_gso (tc0, b[0]);
	      tcp_output_handle_packet (tc0, b[0], node, &next[0], is_ip4);
	    }
	  else
	    {
	      tcp_inc_err_counter (err_counters, TCP_ERROR_INVALID_CONNECTION,
				   1);
	      next[0] = TCP_OUTPUT_NEXT_DROP;
	    }
	  if (tc1 != 0)
	    {
	      tcp_output_push_ip (vm, b[1], tc1, is_ip4);
	      tcp_check_if_gso (tc1, b[1]);
	      tcp_output_handle_packet (tc1, b[1], node, &next[1], is_ip4);
	    }
	  else
	    {
	      tcp_inc_err_counter (err_counters, TCP_ERROR_INVALID_CONNECTION,
				   1);
	      next[1] = TCP_OUTPUT_NEXT_DROP;
	    }
	}

      b += 2;
      next += 2;
      n_left_from -= 2;
    }
  /* Single-loop for the remaining 1-3 buffers */
  while (n_left_from > 0)
    {
      tcp_connection_t *tc0;

      if (n_left_from > 1)
	{
	  vlib_prefetch_buffer_header (b[1], STORE);
	  CLIB_PREFETCH (b[1]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
	}

      tc0 = tcp_connection_get (vnet_buffer (b[0])->tcp.connection_index,
				thread_index);

      if (PREDICT_TRUE (tc0 != 0))
	{
	  tcp_output_push_ip (vm, b[0], tc0, is_ip4);
	  tcp_check_if_gso (tc0, b[0]);
	  tcp_output_handle_packet (tc0, b[0], node, &next[0], is_ip4);
	}
      else
	{
	  tcp_inc_err_counter (err_counters, TCP_ERROR_INVALID_CONNECTION, 1);
	  next[0] = TCP_OUTPUT_NEXT_DROP;
	}

      b += 1;
      next += 1;
      n_left_from -= 1;
    }

  tcp_store_err_counters (output, err_counters);
  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
  vlib_node_increment_counter (vm, tcp_node_index (output, is_ip4),
			       TCP_ERROR_PKTS_SENT, frame->n_vectors);
  return frame->n_vectors;
}
2304 :
/** ip4 variant of the tcp output node */
VLIB_NODE_FN (tcp4_output_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
				 vlib_frame_t * from_frame)
{
  return tcp46_output_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}
2310 :
/** ip6 variant of the tcp output node */
VLIB_NODE_FN (tcp6_output_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
				 vlib_frame_t * from_frame)
{
  return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}
2316 :
2317 : /* *INDENT-OFF* */
/* Node registration: tcp4-output feeds ip4 lookup/rewrite/arp or drop */
VLIB_REGISTER_NODE (tcp4_output_node) =
{
  .name = "tcp4-output",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .protocol_hint = VLIB_NODE_PROTO_HINT_TCP,
  .error_counters = tcp_output_error_counters,
  .n_next_nodes = TCP_OUTPUT_N_NEXT,
  .next_nodes = {
#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n,
    foreach_tcp4_output_next
#undef _
  },
  .format_buffer = format_tcp_header,
  .format_trace = format_tcp_tx_trace,
};
2335 : /* *INDENT-ON* */
2336 :
2337 : /* *INDENT-OFF* */
/* Node registration: tcp6-output feeds ip6 lookup/rewrite/neighbor or drop */
VLIB_REGISTER_NODE (tcp6_output_node) =
{
  .name = "tcp6-output",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .protocol_hint = VLIB_NODE_PROTO_HINT_TCP,
  .error_counters = tcp_output_error_counters,
  .n_next_nodes = TCP_OUTPUT_N_NEXT,
  .next_nodes = {
#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n,
    foreach_tcp6_output_next
#undef _
  },
  .format_buffer = format_tcp_header,
  .format_trace = format_tcp_tx_trace,
};
2355 : /* *INDENT-ON* */
2356 :
/** Next-node dispositions for buffers leaving the tcp reset nodes */
typedef enum _tcp_reset_next
{
  TCP_RESET_NEXT_DROP,
  TCP_RESET_NEXT_IP_LOOKUP,
  TCP_RESET_N_NEXT
} tcp_reset_next_t;

/* Next-node name mappings used when registering tcp4-reset */
#define foreach_tcp4_reset_next        	\
  _(DROP, "error-drop")                	\
  _(IP_LOOKUP, "ip4-lookup")

/* Next-node name mappings used when registering tcp6-reset */
#define foreach_tcp6_reset_next        	\
  _(DROP, "error-drop")                	\
  _(IP_LOOKUP, "ip6-lookup")
2371 :
2372 : static void
2373 0 : tcp_reset_trace_frame (vlib_main_t *vm, vlib_node_runtime_t *node,
2374 : vlib_buffer_t **bs, u32 n_bufs, u8 is_ip4)
2375 : {
2376 : tcp_header_t *tcp;
2377 : tcp_tx_trace_t *t;
2378 : int i;
2379 :
2380 0 : for (i = 0; i < n_bufs; i++)
2381 : {
2382 0 : if (bs[i]->flags & VLIB_BUFFER_IS_TRACED)
2383 : {
2384 0 : tcp = vlib_buffer_get_current (bs[i]);
2385 0 : t = vlib_add_trace (vm, node, bs[i], sizeof (*t));
2386 :
2387 0 : if (is_ip4)
2388 : {
2389 0 : ip4_header_t *ih4 = vlib_buffer_get_current (bs[i]);
2390 0 : tcp = ip4_next_header (ih4);
2391 0 : t->tcp_connection.c_lcl_ip.ip4 = ih4->dst_address;
2392 0 : t->tcp_connection.c_rmt_ip.ip4 = ih4->src_address;
2393 0 : t->tcp_connection.c_is_ip4 = 1;
2394 : }
2395 : else
2396 : {
2397 0 : ip6_header_t *ih6 = vlib_buffer_get_current (bs[i]);
2398 0 : tcp = ip6_next_header (ih6);
2399 0 : t->tcp_connection.c_lcl_ip.ip6 = ih6->dst_address;
2400 0 : t->tcp_connection.c_rmt_ip.ip6 = ih6->src_address;
2401 : }
2402 0 : t->tcp_connection.c_lcl_port = tcp->dst_port;
2403 0 : t->tcp_connection.c_rmt_port = tcp->src_port;
2404 0 : t->tcp_connection.c_proto = TRANSPORT_PROTO_TCP;
2405 0 : clib_memcpy_fast (&t->tcp_header, tcp, sizeof (t->tcp_header));
2406 : }
2407 : }
2408 0 : }
2409 :
2410 : static uword
2411 2 : tcp46_reset_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
2412 : vlib_frame_t *frame, u8 is_ip4)
2413 : {
2414 : vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
2415 : u16 nexts[VLIB_FRAME_SIZE], *next;
2416 : u32 n_left_from, *from;
2417 :
2418 2 : from = vlib_frame_vector_args (frame);
2419 2 : n_left_from = frame->n_vectors;
2420 2 : vlib_get_buffers (vm, from, bufs, n_left_from);
2421 :
2422 2 : b = bufs;
2423 2 : next = nexts;
2424 :
2425 88 : while (n_left_from > 0)
2426 : {
2427 86 : tcp_buffer_make_reset (vm, b[0], is_ip4);
2428 :
2429 : /* IP lookup in fib where it was received. Previous value
2430 : * was overwritten by tcp-input */
2431 86 : vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
2432 86 : vec_elt (ip4_main.fib_index_by_sw_if_index,
2433 : vnet_buffer (b[0])->sw_if_index[VLIB_RX]);
2434 :
2435 86 : b[0]->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
2436 86 : next[0] = TCP_RESET_NEXT_IP_LOOKUP;
2437 :
2438 86 : b += 1;
2439 86 : next += 1;
2440 86 : n_left_from -= 1;
2441 : }
2442 :
2443 2 : if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
2444 0 : tcp_reset_trace_frame (vm, node, bufs, frame->n_vectors, is_ip4);
2445 :
2446 2 : vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
2447 :
2448 2 : vlib_node_increment_counter (vm, node->node_index, TCP_ERROR_RST_SENT,
2449 2 : frame->n_vectors);
2450 :
2451 2 : return frame->n_vectors;
2452 : }
2453 :
/** ip4 variant of the tcp reset node */
VLIB_NODE_FN (tcp4_reset_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
				vlib_frame_t * from_frame)
{
  return tcp46_reset_inline (vm, node, from_frame, 1);
}
2459 :
/** ip6 variant of the tcp reset node */
VLIB_NODE_FN (tcp6_reset_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
				vlib_frame_t * from_frame)
{
  return tcp46_reset_inline (vm, node, from_frame, 0);
}
2465 :
2466 : /* *INDENT-OFF* */
/* Node registration: tcp4-reset feeds ip4-lookup or drop */
VLIB_REGISTER_NODE (tcp4_reset_node) = {
  .name = "tcp4-reset",
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .error_counters = tcp_output_error_counters,
  .n_next_nodes = TCP_RESET_N_NEXT,
  .next_nodes = {
#define _(s,n) [TCP_RESET_NEXT_##s] = n,
    foreach_tcp4_reset_next
#undef _
  },
  .format_trace = format_tcp_tx_trace,
};
2480 : /* *INDENT-ON* */
2481 :
2482 : /* *INDENT-OFF* */
/* Node registration: tcp6-reset feeds ip6-lookup or drop */
VLIB_REGISTER_NODE (tcp6_reset_node) = {
  .name = "tcp6-reset",
  .vector_size = sizeof (u32),
  .n_errors = TCP_N_ERROR,
  .error_counters = tcp_output_error_counters,
  .n_next_nodes = TCP_RESET_N_NEXT,
  .next_nodes = {
#define _(s,n) [TCP_RESET_NEXT_##s] = n,
    foreach_tcp6_reset_next
#undef _
  },
  .format_trace = format_tcp_tx_trace,
};
2496 : /* *INDENT-ON* */
2497 :
2498 : /*
2499 : * fd.io coding-style-patch-verification: ON
2500 : *
2501 : * Local Variables:
2502 : * eval: (c-set-style "gnu")
2503 : * End:
2504 : */
|