Line data Source code
1 : /*
2 : *------------------------------------------------------------------
3 : * Copyright (c) 2018 Cisco and/or its affiliates.
4 : * Licensed under the Apache License, Version 2.0 (the "License");
5 : * you may not use this file except in compliance with the License.
6 : * You may obtain a copy of the License at:
7 : *
8 : * http://www.apache.org/licenses/LICENSE-2.0
9 : *
10 : * Unless required by applicable law or agreed to in writing, software
11 : * distributed under the License is distributed on an "AS IS" BASIS,
12 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 : * See the License for the specific language governing permissions and
14 : * limitations under the License.
15 : *------------------------------------------------------------------
16 : */
17 :
18 : #include <stdio.h>
19 : #include <net/if.h>
20 : #include <sys/ioctl.h>
21 : #include <linux/ethtool.h>
22 : #include <linux/if_link.h>
23 : #include <linux/sockios.h>
24 : #include <linux/limits.h>
25 : #include <bpf/bpf.h>
26 : #include <vlib/vlib.h>
27 : #include <vlib/unix/unix.h>
28 : #include <vlib/pci/pci.h>
29 : #include <vppinfra/linux/netns.h>
30 : #include <vppinfra/linux/sysfs.h>
31 : #include <vppinfra/unix.h>
32 : #include <vnet/ethernet/ethernet.h>
33 : #include <vnet/interface/rx_queue_funcs.h>
34 : #include <vnet/interface/tx_queue_funcs.h>
35 : #include "af_xdp.h"
36 :
37 : #ifndef XDP_UMEM_MIN_CHUNK_SIZE
38 : #define XDP_UMEM_MIN_CHUNK_SIZE 2048
39 : #endif
40 :
41 : af_xdp_main_t af_xdp_main;
42 :
43 : typedef struct
44 : {
45 : u32 prod;
46 : u32 cons;
47 : } gdb_af_xdp_pair_t;
48 :
49 : gdb_af_xdp_pair_t
50 0 : gdb_af_xdp_get_prod (const struct xsk_ring_prod *prod)
51 : {
52 0 : gdb_af_xdp_pair_t pair = { *prod->producer, *prod->consumer };
53 0 : return pair;
54 : }
55 :
56 : gdb_af_xdp_pair_t
57 0 : gdb_af_xdp_get_cons (const struct xsk_ring_cons * cons)
58 : {
59 0 : gdb_af_xdp_pair_t pair = { *cons->producer, *cons->consumer };
60 0 : return pair;
61 : }
62 :
63 : static clib_error_t *
64 0 : af_xdp_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new)
65 : {
66 0 : af_xdp_main_t *am = &af_xdp_main;
67 0 : af_xdp_device_t *ad = vec_elt_at_index (am->devices, hw->dev_instance);
68 0 : errno_t err = memcpy_s (ad->hwaddr, sizeof (ad->hwaddr), new, 6);
69 0 : if (err)
70 0 : return clib_error_return_code (0, -err, CLIB_ERROR_ERRNO_VALID,
71 : "mac change failed");
72 0 : return 0;
73 : }
74 :
75 : static clib_error_t *
76 0 : af_xdp_set_max_frame_size (vnet_main_t *vnm, vnet_hw_interface_t *hw,
77 : u32 frame_size)
78 : {
79 0 : af_xdp_main_t *am = &af_xdp_main;
80 0 : af_xdp_device_t *ad = vec_elt_at_index (am->devices, hw->dev_instance);
81 0 : af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "set mtu not supported yet");
82 0 : return vnet_error (VNET_ERR_UNSUPPORTED, 0);
83 : }
84 :
85 : static u32
86 0 : af_xdp_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
87 : {
88 0 : af_xdp_main_t *am = &af_xdp_main;
89 0 : af_xdp_device_t *ad = vec_elt_at_index (am->devices, hw->dev_instance);
90 :
91 0 : switch (flags)
92 : {
93 0 : case 0:
94 0 : af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "set unicast not supported yet");
95 0 : return ~0;
96 0 : case ETHERNET_INTERFACE_FLAG_ACCEPT_ALL:
97 0 : af_xdp_log (VLIB_LOG_LEVEL_ERR, ad,
98 : "set promiscuous not supported yet");
99 0 : return ~0;
100 : }
101 :
102 0 : af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "unknown flag %x requested", flags);
103 0 : return ~0;
104 : }
105 :
106 : int
107 0 : af_xdp_enter_netns (char *netns, int *fds)
108 : {
109 0 : *fds = *(fds + 1) = -1;
110 0 : if (netns != NULL)
111 : {
112 0 : *fds = clib_netns_open (NULL /* self */);
113 0 : if ((*(fds + 1) = clib_netns_open ((u8 *) netns)) == -1)
114 0 : return VNET_API_ERROR_SYSCALL_ERROR_8;
115 0 : if (clib_setns (*(fds + 1)) == -1)
116 0 : return VNET_API_ERROR_SYSCALL_ERROR_9;
117 : }
118 0 : return 0;
119 : }
120 :
/*
 * Close the two namespace descriptors held in fds[0] and fds[1] (entries
 * equal to -1 are skipped) and reset both entries to -1.
 */
void
af_xdp_cleanup_netns (int *fds)
{
  int i;

  for (i = 0; i < 2; i++)
    {
      if (fds[i] == -1)
	continue;
      close (fds[i]);
    }

  fds[0] = -1;
  fds[1] = -1;
}
132 :
/*
 * Counterpart of af_xdp_enter_netns(): switch back to the namespace saved
 * in fds[0] (if any) and release both descriptors.
 *
 * Does nothing when netns is NULL. Returns the result of clib_setns(), or 0
 * when no switch was attempted.
 */
int
af_xdp_exit_netns (char *netns, int *fds)
{
  int rv;

  if (netns == NULL)
    return 0;

  rv = (fds[0] != -1) ? clib_setns (fds[0]) : 0;
  af_xdp_cleanup_netns (fds);
  return rv;
}
147 :
148 : static int
149 0 : af_xdp_remove_program (af_xdp_device_t *ad)
150 : {
151 0 : u32 curr_prog_id = 0;
152 : int ret;
153 : int ns_fds[2];
154 :
155 0 : af_xdp_enter_netns (ad->netns, ns_fds);
156 0 : ret = bpf_xdp_query_id (ad->linux_ifindex, XDP_FLAGS_UPDATE_IF_NOEXIST,
157 : &curr_prog_id);
158 0 : if (ret != 0)
159 : {
160 0 : af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "bpf_xdp_query_id failed\n");
161 0 : goto err0;
162 : }
163 :
164 0 : ret = bpf_xdp_detach (ad->linux_ifindex, XDP_FLAGS_UPDATE_IF_NOEXIST, NULL);
165 0 : if (ret != 0)
166 : {
167 0 : af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "bpf_xdp_detach failed\n");
168 0 : goto err0;
169 : }
170 0 : af_xdp_exit_netns (ad->netns, ns_fds);
171 0 : if (ad->bpf_obj)
172 0 : bpf_object__close (ad->bpf_obj);
173 :
174 0 : return 0;
175 :
176 0 : err0:
177 0 : af_xdp_exit_netns (ad->netns, ns_fds);
178 0 : return ret;
179 : }
180 :
181 : void
182 0 : af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad)
183 : {
184 0 : vnet_main_t *vnm = vnet_get_main ();
185 0 : af_xdp_main_t *axm = &af_xdp_main;
186 : struct xsk_socket **xsk;
187 : struct xsk_umem **umem;
188 : int i;
189 :
190 0 : if (ad->hw_if_index)
191 : {
192 0 : vnet_hw_interface_set_flags (vnm, ad->hw_if_index, 0);
193 0 : ethernet_delete_interface (vnm, ad->hw_if_index);
194 : }
195 :
196 0 : for (i = 0; i < ad->txq_num; i++)
197 0 : clib_spinlock_free (&vec_elt (ad->txqs, i).lock);
198 :
199 0 : vec_foreach (xsk, ad->xsk)
200 0 : xsk_socket__delete (*xsk);
201 :
202 0 : vec_foreach (umem, ad->umem)
203 0 : xsk_umem__delete (*umem);
204 :
205 0 : for (i = 0; i < ad->rxq_num; i++)
206 0 : clib_file_del_by_index (&file_main, vec_elt (ad->rxqs, i).file_index);
207 :
208 0 : if (af_xdp_remove_program (ad) != 0)
209 0 : af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "Error while removing XDP program.\n");
210 :
211 0 : vec_free (ad->xsk);
212 0 : vec_free (ad->umem);
213 0 : vec_free (ad->buffer_template);
214 0 : vec_free (ad->rxqs);
215 0 : vec_free (ad->txqs);
216 0 : vec_free (ad->name);
217 0 : vec_free (ad->linux_ifname);
218 0 : vec_free (ad->netns);
219 0 : clib_error_free (ad->error);
220 0 : pool_put (axm->devices, ad);
221 0 : }
222 :
223 : static int
224 0 : af_xdp_load_program (af_xdp_create_if_args_t * args, af_xdp_device_t * ad)
225 : {
226 : int fd;
227 : struct bpf_program *bpf_prog;
228 0 : struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY };
229 :
230 0 : if (setrlimit (RLIMIT_MEMLOCK, &r))
231 0 : af_xdp_log (VLIB_LOG_LEVEL_WARNING, ad,
232 : "setrlimit(%s) failed: %s (errno %d)", ad->linux_ifname,
233 : strerror (errno), errno);
234 :
235 0 : ad->bpf_obj = bpf_object__open_file (args->prog, NULL);
236 0 : if (libbpf_get_error (ad->bpf_obj))
237 : {
238 0 : args->rv = VNET_API_ERROR_SYSCALL_ERROR_5;
239 0 : args->error = clib_error_return_unix (
240 : 0, "bpf_object__open_file(%s) failed", args->prog);
241 0 : goto err0;
242 : }
243 :
244 0 : bpf_prog = bpf_object__next_program (ad->bpf_obj, NULL);
245 0 : if (!bpf_prog)
246 0 : goto err1;
247 :
248 0 : bpf_program__set_type (bpf_prog, BPF_PROG_TYPE_XDP);
249 :
250 0 : if (bpf_object__load (ad->bpf_obj))
251 0 : goto err1;
252 :
253 0 : fd = bpf_program__fd (bpf_prog);
254 :
255 0 : if (bpf_xdp_attach (ad->linux_ifindex, fd, XDP_FLAGS_UPDATE_IF_NOEXIST,
256 : NULL))
257 : {
258 0 : args->rv = VNET_API_ERROR_SYSCALL_ERROR_6;
259 0 : args->error = clib_error_return_unix (0, "bpf_xdp_attach(%s) failed",
260 : ad->linux_ifname);
261 0 : goto err1;
262 : }
263 :
264 0 : return 0;
265 :
266 0 : err1:
267 0 : bpf_object__close (ad->bpf_obj);
268 0 : ad->bpf_obj = 0;
269 0 : err0:
270 0 : return -1;
271 : }
272 :
273 : static int
274 0 : af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args,
275 : af_xdp_device_t *ad, int qid)
276 : {
277 : struct xsk_umem **umem;
278 : struct xsk_socket **xsk;
279 : af_xdp_rxq_t *rxq;
280 : af_xdp_txq_t *txq;
281 : struct xsk_umem_config umem_config;
282 : struct xsk_socket_config sock_config;
283 : struct xdp_options opt;
284 : socklen_t optlen;
285 0 : const int is_rx = qid < ad->rxq_num;
286 0 : const int is_tx = qid < ad->txq_num;
287 :
288 0 : umem = vec_elt_at_index (ad->umem, qid);
289 0 : xsk = vec_elt_at_index (ad->xsk, qid);
290 0 : rxq = vec_elt_at_index (ad->rxqs, qid);
291 0 : txq = vec_elt_at_index (ad->txqs, qid);
292 :
293 : /*
294 : * fq and cq must always be allocated even if unused
295 : * whereas rx and tx indicates whether we want rxq, txq, or both
296 : */
297 0 : struct xsk_ring_cons *rx = is_rx ? &rxq->rx : 0;
298 0 : struct xsk_ring_prod *fq = &rxq->fq;
299 0 : struct xsk_ring_prod *tx = is_tx ? &txq->tx : 0;
300 0 : struct xsk_ring_cons *cq = &txq->cq;
301 : int fd;
302 :
303 0 : memset (&umem_config, 0, sizeof (umem_config));
304 0 : umem_config.fill_size = args->rxq_size;
305 0 : umem_config.comp_size = args->txq_size;
306 0 : umem_config.frame_size =
307 0 : sizeof (vlib_buffer_t) + vlib_buffer_get_default_data_size (vm);
308 0 : umem_config.frame_headroom = sizeof (vlib_buffer_t);
309 0 : umem_config.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG;
310 0 : if (xsk_umem__create
311 0 : (umem, uword_to_pointer (vm->buffer_main->buffer_mem_start, void *),
312 0 : vm->buffer_main->buffer_mem_size, fq, cq, &umem_config))
313 : {
314 0 : uword sys_page_size = clib_mem_get_page_size ();
315 0 : args->rv = VNET_API_ERROR_SYSCALL_ERROR_1;
316 0 : args->error = clib_error_return_unix (0, "xsk_umem__create() failed");
317 : /* this should mimic the Linux kernel net/xdp/xdp_umem.c:xdp_umem_reg()
318 : * check */
319 0 : if (umem_config.frame_size < XDP_UMEM_MIN_CHUNK_SIZE ||
320 0 : umem_config.frame_size > sys_page_size)
321 0 : args->error = clib_error_return (
322 : args->error,
323 : "(unsupported data-size? (should be between %d and %d))",
324 : XDP_UMEM_MIN_CHUNK_SIZE - sizeof (vlib_buffer_t),
325 : sys_page_size - sizeof (vlib_buffer_t));
326 0 : goto err0;
327 : }
328 :
329 0 : memset (&sock_config, 0, sizeof (sock_config));
330 0 : sock_config.rx_size = args->rxq_size;
331 0 : sock_config.tx_size = args->txq_size;
332 0 : sock_config.bind_flags = XDP_USE_NEED_WAKEUP;
333 0 : switch (args->mode)
334 : {
335 0 : case AF_XDP_MODE_AUTO:
336 0 : break;
337 0 : case AF_XDP_MODE_COPY:
338 0 : sock_config.bind_flags |= XDP_COPY;
339 0 : break;
340 0 : case AF_XDP_MODE_ZERO_COPY:
341 0 : sock_config.bind_flags |= XDP_ZEROCOPY;
342 0 : break;
343 : }
344 0 : if (args->prog)
345 0 : sock_config.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
346 0 : if (xsk_socket__create
347 0 : (xsk, ad->linux_ifname, qid, *umem, rx, tx, &sock_config))
348 : {
349 0 : args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
350 0 : args->error =
351 0 : clib_error_return_unix (0,
352 : "xsk_socket__create() failed (is linux netdev %s up?)",
353 : ad->linux_ifname);
354 0 : goto err1;
355 : }
356 :
357 0 : fd = xsk_socket__fd (*xsk);
358 0 : if (args->prog)
359 : {
360 : struct bpf_map *map =
361 0 : bpf_object__find_map_by_name (ad->bpf_obj, "xsks_map");
362 0 : int ret = xsk_socket__update_xskmap (*xsk, bpf_map__fd (map));
363 0 : if (ret)
364 : {
365 0 : args->rv = VNET_API_ERROR_SYSCALL_ERROR_3;
366 0 : args->error = clib_error_return_unix (
367 : 0, "xsk_socket__update_xskmap %s qid %d return %d",
368 : ad->linux_ifname, qid, ret);
369 0 : goto err2;
370 : }
371 : }
372 0 : optlen = sizeof (opt);
373 : #ifndef SOL_XDP
374 : #define SOL_XDP 283
375 : #endif
376 0 : if (getsockopt (fd, SOL_XDP, XDP_OPTIONS, &opt, &optlen))
377 : {
378 0 : args->rv = VNET_API_ERROR_SYSCALL_ERROR_4;
379 0 : args->error =
380 0 : clib_error_return_unix (0, "getsockopt(XDP_OPTIONS) failed");
381 0 : goto err2;
382 : }
383 0 : if (opt.flags & XDP_OPTIONS_ZEROCOPY)
384 0 : ad->flags |= AF_XDP_DEVICE_F_ZEROCOPY;
385 :
386 0 : rxq->xsk_fd = is_rx ? fd : -1;
387 :
388 0 : if (is_tx)
389 : {
390 0 : txq->xsk_fd = fd;
391 0 : clib_spinlock_init (&txq->lock);
392 0 : if (is_rx && (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK))
393 : {
394 : /* This is a shared rx+tx queue and we need to lock before syscalls.
395 : * Prior to Linux 5.6 there is a race condition preventing to call
396 : * poll() and sendto() concurrently on AF_XDP sockets. This was
397 : * fixed with commit 11cc2d21499cabe7e7964389634ed1de3ee91d33
398 : * to workaround this issue, we protect the syscalls with a
399 : * spinlock. Note that it also prevents to use interrupt mode in
400 : * multi workers setup, because in this case the poll() is done in
401 : * the framework w/o any possibility to protect it.
402 : * See
403 : * https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/
404 : */
405 0 : clib_spinlock_init (&rxq->syscall_lock);
406 0 : txq->syscall_lock = rxq->syscall_lock;
407 : }
408 : }
409 : else
410 : {
411 0 : txq->xsk_fd = -1;
412 : }
413 :
414 0 : return 0;
415 :
416 0 : err2:
417 0 : xsk_socket__delete (*xsk);
418 0 : err1:
419 0 : xsk_umem__delete (*umem);
420 0 : err0:
421 0 : *umem = 0;
422 0 : *xsk = 0;
423 0 : return -1;
424 : }
425 :
426 : static int
427 0 : af_xdp_get_numa (const char *ifname)
428 : {
429 : char *path;
430 : clib_error_t *err;
431 : int numa;
432 :
433 0 : path =
434 0 : (char *) format (0, "/sys/class/net/%s/device/numa_node%c", ifname, 0);
435 0 : err = clib_sysfs_read (path, "%d", &numa);
436 0 : if (err || numa < 0)
437 0 : numa = 0;
438 :
439 0 : clib_error_free (err);
440 0 : vec_free (path);
441 0 : return numa;
442 : }
443 :
444 : static void
445 0 : af_xdp_get_q_count (const char *ifname, int *rxq_num, int *txq_num)
446 : {
447 0 : struct ethtool_channels ec = { .cmd = ETHTOOL_GCHANNELS };
448 0 : struct ifreq ifr = { .ifr_data = (void *) &ec };
449 : int fd, err;
450 :
451 0 : *rxq_num = *txq_num = 1;
452 :
453 0 : fd = socket (AF_INET, SOCK_DGRAM, 0);
454 0 : if (fd < 0)
455 0 : return;
456 :
457 0 : snprintf (ifr.ifr_name, sizeof (ifr.ifr_name), "%s", ifname);
458 0 : err = ioctl (fd, SIOCETHTOOL, &ifr);
459 :
460 0 : close (fd);
461 :
462 0 : if (err)
463 0 : return;
464 :
465 0 : *rxq_num = clib_max (ec.combined_count, ec.rx_count);
466 0 : *txq_num = clib_max (ec.combined_count, ec.tx_count);
467 : }
468 :
469 : static clib_error_t *
470 0 : af_xdp_device_rxq_read_ready (clib_file_t * f)
471 : {
472 0 : vnet_hw_if_rx_queue_set_int_pending (vnet_get_main (), f->private_data);
473 0 : return 0;
474 : }
475 :
476 : static clib_error_t *
477 0 : af_xdp_device_set_rxq_mode (const af_xdp_device_t *ad, af_xdp_rxq_t *rxq,
478 : const af_xdp_rxq_mode_t mode)
479 : {
480 0 : clib_file_main_t *fm = &file_main;
481 : clib_file_update_type_t update;
482 : clib_file_t *f;
483 :
484 0 : if (rxq->mode == mode)
485 0 : return 0;
486 :
487 0 : switch (mode)
488 : {
489 0 : case AF_XDP_RXQ_MODE_POLLING:
490 0 : update = UNIX_FILE_UPDATE_DELETE;
491 0 : break;
492 0 : case AF_XDP_RXQ_MODE_INTERRUPT:
493 0 : if (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK)
494 0 : return clib_error_create (
495 : "kernel workaround incompatible with interrupt mode");
496 0 : update = UNIX_FILE_UPDATE_ADD;
497 0 : break;
498 0 : default:
499 0 : ASSERT (0);
500 0 : return clib_error_create ("unknown rxq mode %i", mode);
501 : }
502 :
503 0 : f = clib_file_get (fm, rxq->file_index);
504 0 : fm->file_update (f, update);
505 0 : rxq->mode = mode;
506 0 : return 0;
507 : }
508 :
509 : static u32
510 0 : af_xdp_find_rxq_for_thread (vnet_main_t *vnm, const af_xdp_device_t *ad,
511 : const u32 thread)
512 : {
513 : u32 i;
514 0 : for (i = 0; i < ad->rxq_num; i++)
515 : {
516 0 : const u32 qid = vec_elt (ad->rxqs, i).queue_index;
517 0 : const u32 tid = vnet_hw_if_get_rx_queue (vnm, qid)->thread_index;
518 0 : if (tid == thread)
519 0 : return i;
520 : }
521 0 : return ~0;
522 : }
523 :
524 : static clib_error_t *
525 0 : af_xdp_finalize_queues (vnet_main_t *vnm, af_xdp_device_t *ad,
526 : const int n_vlib_mains)
527 : {
528 0 : clib_error_t *err = 0;
529 : int i;
530 :
531 0 : for (i = 0; i < ad->rxq_num; i++)
532 : {
533 0 : af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i);
534 0 : rxq->queue_index = vnet_hw_if_register_rx_queue (
535 : vnm, ad->hw_if_index, i, VNET_HW_IF_RXQ_THREAD_ANY);
536 0 : u8 *desc = format (0, "%U rxq %d", format_af_xdp_device_name,
537 : ad->dev_instance, i);
538 0 : clib_file_t f = {
539 0 : .file_descriptor = rxq->xsk_fd,
540 0 : .private_data = rxq->queue_index,
541 : .read_function = af_xdp_device_rxq_read_ready,
542 : .description = desc,
543 : };
544 0 : rxq->file_index = clib_file_add (&file_main, &f);
545 0 : vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index,
546 0 : rxq->file_index);
547 0 : err = af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING);
548 0 : if (err)
549 0 : return err;
550 : }
551 :
552 0 : for (i = 0; i < ad->txq_num; i++)
553 0 : vec_elt (ad->txqs, i).queue_index =
554 0 : vnet_hw_if_register_tx_queue (vnm, ad->hw_if_index, i);
555 :
556 : /* We set the rxq and txq of the same queue pair on the same thread
557 : * by default to avoid locking because of the syscall lock. */
558 0 : int last_qid = clib_min (ad->rxq_num, ad->txq_num - 1);
559 0 : for (i = 0; i < n_vlib_mains; i++)
560 : {
561 : /* search for the 1st rxq assigned on this thread, if any */
562 0 : u32 qid = af_xdp_find_rxq_for_thread (vnm, ad, i);
563 : /* if this rxq is combined with a txq, use it. Otherwise, we'll
564 : * assign txq in a round-robin fashion. We start from the 1st txq
565 : * not shared with a rxq if possible... */
566 0 : qid = qid < ad->txq_num ? qid : (last_qid++ % ad->txq_num);
567 0 : vnet_hw_if_tx_queue_assign_thread (
568 0 : vnm, vec_elt (ad->txqs, qid).queue_index, i);
569 : }
570 :
571 0 : vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index);
572 0 : return 0;
573 : }
574 :
575 : void
576 0 : af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
577 : {
578 0 : vnet_main_t *vnm = vnet_get_main ();
579 0 : vlib_thread_main_t *tm = vlib_get_thread_main ();
580 0 : vnet_eth_interface_registration_t eir = {};
581 0 : af_xdp_main_t *am = &af_xdp_main;
582 : af_xdp_device_t *ad;
583 : vnet_sw_interface_t *sw;
584 : int rxq_num, txq_num, q_num;
585 : int ns_fds[2];
586 : int i, ret;
587 :
588 0 : args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE;
589 0 : args->txq_size = args->txq_size ? args->txq_size : 2 * VLIB_FRAME_SIZE;
590 0 : args->rxq_num = args->rxq_num ? args->rxq_num : 1;
591 :
592 0 : if (!args->linux_ifname)
593 : {
594 0 : args->rv = VNET_API_ERROR_INVALID_VALUE;
595 0 : args->error = clib_error_return (0, "missing host interface");
596 0 : goto err0;
597 : }
598 :
599 0 : if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE ||
600 0 : args->rxq_size > 65535 || args->txq_size > 65535 ||
601 0 : !is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
602 : {
603 0 : args->rv = VNET_API_ERROR_INVALID_VALUE;
604 0 : args->error =
605 0 : clib_error_return (0,
606 : "queue size must be a power of two between %i and 65535",
607 : VLIB_FRAME_SIZE);
608 0 : goto err0;
609 : }
610 :
611 0 : ret = af_xdp_enter_netns (args->netns, ns_fds);
612 0 : if (ret)
613 : {
614 0 : args->rv = ret;
615 0 : args->error = clib_error_return (0, "enter netns %s failed, ret %d",
616 : args->netns, args->rv);
617 0 : goto err0;
618 : }
619 :
620 0 : af_xdp_get_q_count (args->linux_ifname, &rxq_num, &txq_num);
621 0 : if (args->rxq_num > rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != args->rxq_num)
622 : {
623 0 : args->rv = VNET_API_ERROR_INVALID_VALUE;
624 0 : args->error = clib_error_create ("too many rxq requested (%d > %d)",
625 : args->rxq_num, rxq_num);
626 0 : goto err1;
627 : }
628 0 : rxq_num = clib_min (rxq_num, args->rxq_num);
629 0 : txq_num = clib_min (txq_num, tm->n_vlib_mains);
630 :
631 0 : pool_get_zero (am->devices, ad);
632 :
633 0 : if (tm->n_vlib_mains > 1 &&
634 0 : 0 == (args->flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK))
635 0 : ad->flags |= AF_XDP_DEVICE_F_SYSCALL_LOCK;
636 :
637 0 : ad->linux_ifname = (char *) format (0, "%s", args->linux_ifname);
638 0 : vec_validate (ad->linux_ifname, IFNAMSIZ - 1); /* libbpf expects ifname to be at least IFNAMSIZ */
639 :
640 0 : if (args->netns)
641 0 : ad->netns = (char *) format (0, "%s%c", args->netns, 0);
642 :
643 0 : ad->linux_ifindex = if_nametoindex (ad->linux_ifname);
644 0 : if (!ad->linux_ifindex)
645 : {
646 0 : args->rv = VNET_API_ERROR_INVALID_VALUE;
647 0 : args->error = clib_error_return_unix (0, "if_nametoindex(%s) failed",
648 : ad->linux_ifname);
649 0 : ad->linux_ifindex = ~0;
650 0 : goto err1;
651 : }
652 :
653 0 : if (args->prog &&
654 0 : (af_xdp_remove_program (ad) || af_xdp_load_program (args, ad)))
655 0 : goto err2;
656 :
657 0 : q_num = clib_max (rxq_num, txq_num);
658 0 : ad->rxq_num = rxq_num;
659 0 : ad->txq_num = txq_num;
660 :
661 0 : vec_validate_aligned (ad->umem, q_num - 1, CLIB_CACHE_LINE_BYTES);
662 0 : vec_validate_aligned (ad->xsk, q_num - 1, CLIB_CACHE_LINE_BYTES);
663 0 : vec_validate_aligned (ad->rxqs, q_num - 1, CLIB_CACHE_LINE_BYTES);
664 0 : vec_validate_aligned (ad->txqs, q_num - 1, CLIB_CACHE_LINE_BYTES);
665 :
666 0 : for (i = 0; i < q_num; i++)
667 : {
668 0 : if (af_xdp_create_queue (vm, args, ad, i))
669 : {
670 : /*
671 : * queue creation failed
672 : * it is only a fatal error if we could not create the number of rx
673 : * queues requested explicitely by the user and the user did not
674 : * requested 'max'
675 : * we might create less tx queues than workers but this is ok
676 : */
677 0 : af_xdp_log (VLIB_LOG_LEVEL_DEBUG, ad,
678 : "create interface failed to create queue qid=%d", i);
679 :
680 : /* fixup vectors length */
681 0 : vec_set_len (ad->umem, i);
682 0 : vec_set_len (ad->xsk, i);
683 0 : vec_set_len (ad->rxqs, i);
684 0 : vec_set_len (ad->txqs, i);
685 :
686 0 : ad->rxq_num = clib_min (i, rxq_num);
687 0 : ad->txq_num = clib_min (i, txq_num);
688 :
689 0 : if (i == 0 ||
690 0 : (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != args->rxq_num))
691 : {
692 0 : ad->rxq_num = ad->txq_num = 0;
693 0 : goto err2; /* failed creating requested rxq: fatal error, bailing
694 : out */
695 : }
696 :
697 :
698 0 : args->rv = 0;
699 0 : clib_error_free (args->error);
700 0 : break;
701 : }
702 : }
703 :
704 0 : if (af_xdp_exit_netns (args->netns, ns_fds))
705 : {
706 0 : args->rv = VNET_API_ERROR_SYSCALL_ERROR_10;
707 0 : args->error = clib_error_return (0, "exit netns failed");
708 0 : goto err2;
709 : }
710 :
711 0 : ad->dev_instance = ad - am->devices;
712 0 : ad->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
713 0 : ad->pool =
714 0 : vlib_buffer_pool_get_default_for_numa (vm,
715 0 : af_xdp_get_numa
716 0 : (ad->linux_ifname));
717 0 : if (!args->name)
718 : {
719 0 : char *ifname = ad->linux_ifname;
720 0 : if (args->netns != NULL && strncmp (args->netns, "pid:", 4) == 0)
721 : {
722 0 : ad->name =
723 0 : (char *) format (0, "%s/%u", ifname, atoi (args->netns + 4));
724 : }
725 : else
726 0 : ad->name = (char *) format (0, "%s/%d", ifname, ad->dev_instance);
727 : }
728 : else
729 0 : ad->name = (char *) format (0, "%s", args->name);
730 :
731 0 : ethernet_mac_address_generate (ad->hwaddr);
732 :
733 : /* create interface */
734 0 : eir.dev_class_index = af_xdp_device_class.index;
735 0 : eir.dev_instance = ad->dev_instance;
736 0 : eir.address = ad->hwaddr;
737 0 : eir.cb.flag_change = af_xdp_flag_change;
738 0 : eir.cb.set_max_frame_size = af_xdp_set_max_frame_size;
739 0 : ad->hw_if_index = vnet_eth_register_interface (vnm, &eir);
740 :
741 0 : sw = vnet_get_hw_sw_interface (vnm, ad->hw_if_index);
742 0 : args->sw_if_index = ad->sw_if_index = sw->sw_if_index;
743 :
744 0 : vnet_hw_if_set_caps (vnm, ad->hw_if_index, VNET_HW_IF_CAP_INT_MODE);
745 :
746 0 : vnet_hw_if_set_input_node (vnm, ad->hw_if_index, af_xdp_input_node.index);
747 :
748 0 : args->error = af_xdp_finalize_queues (vnm, ad, tm->n_vlib_mains);
749 0 : if (args->error)
750 : {
751 0 : args->rv = VNET_API_ERROR_SYSCALL_ERROR_7;
752 0 : goto err2;
753 : }
754 :
755 : /* buffer template */
756 0 : vec_validate_aligned (ad->buffer_template, 1, CLIB_CACHE_LINE_BYTES);
757 0 : ad->buffer_template->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
758 0 : ad->buffer_template->ref_count = 1;
759 0 : vnet_buffer (ad->buffer_template)->sw_if_index[VLIB_RX] = ad->sw_if_index;
760 0 : vnet_buffer (ad->buffer_template)->sw_if_index[VLIB_TX] = (u32) ~ 0;
761 0 : ad->buffer_template->buffer_pool_index = ad->pool;
762 :
763 0 : return;
764 :
765 0 : err2:
766 0 : af_xdp_delete_if (vm, ad);
767 0 : err1:
768 0 : af_xdp_cleanup_netns (ns_fds);
769 0 : err0:
770 0 : vlib_log_err (am->log_class, "%U", format_clib_error, args->error);
771 : }
772 :
773 : static clib_error_t *
774 0 : af_xdp_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
775 : {
776 0 : vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
777 0 : af_xdp_main_t *am = &af_xdp_main;
778 0 : af_xdp_device_t *ad = vec_elt_at_index (am->devices, hi->dev_instance);
779 0 : uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
780 :
781 0 : if (ad->flags & AF_XDP_DEVICE_F_ERROR)
782 0 : return clib_error_return (0, "device is in error state");
783 :
784 0 : if (is_up)
785 : {
786 0 : vnet_hw_interface_set_flags (vnm, ad->hw_if_index,
787 : VNET_HW_INTERFACE_FLAG_LINK_UP);
788 0 : ad->flags |= AF_XDP_DEVICE_F_ADMIN_UP;
789 0 : af_xdp_device_input_refill (ad);
790 : }
791 : else
792 : {
793 0 : vnet_hw_interface_set_flags (vnm, ad->hw_if_index, 0);
794 0 : ad->flags &= ~AF_XDP_DEVICE_F_ADMIN_UP;
795 : }
796 0 : return 0;
797 : }
798 :
799 : static clib_error_t *
800 0 : af_xdp_interface_rx_mode_change (vnet_main_t *vnm, u32 hw_if_index, u32 qid,
801 : vnet_hw_if_rx_mode mode)
802 : {
803 0 : af_xdp_main_t *am = &af_xdp_main;
804 0 : vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
805 0 : af_xdp_device_t *ad = pool_elt_at_index (am->devices, hw->dev_instance);
806 0 : af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, qid);
807 :
808 0 : switch (mode)
809 : {
810 0 : default: /* fallthrough */
811 : case VNET_HW_IF_RX_MODE_UNKNOWN: /* fallthrough */
812 : case VNET_HW_IF_NUM_RX_MODES:
813 0 : return clib_error_create ("uknown rx mode - doing nothing");
814 0 : case VNET_HW_IF_RX_MODE_DEFAULT: /* fallthrough */
815 : case VNET_HW_IF_RX_MODE_POLLING:
816 0 : return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING);
817 0 : case VNET_HW_IF_RX_MODE_INTERRUPT: /* fallthrough */
818 : case VNET_HW_IF_RX_MODE_ADAPTIVE:
819 0 : return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_INTERRUPT);
820 : }
821 :
822 : ASSERT (0 && "unreachable");
823 : return clib_error_create ("unreachable");
824 : }
825 :
826 : static void
827 0 : af_xdp_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
828 : u32 node_index)
829 : {
830 0 : af_xdp_main_t *am = &af_xdp_main;
831 0 : vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
832 0 : af_xdp_device_t *ad = pool_elt_at_index (am->devices, hw->dev_instance);
833 :
834 : /* Shut off redirection */
835 0 : if (node_index == ~0)
836 : {
837 0 : ad->per_interface_next_index = node_index;
838 0 : return;
839 : }
840 :
841 0 : ad->per_interface_next_index =
842 0 : vlib_node_add_next (vlib_get_main (), af_xdp_input_node.index,
843 : node_index);
844 : }
845 :
846 : static char *af_xdp_tx_func_error_strings[] = {
847 : #define _(n,s) s,
848 : foreach_af_xdp_tx_func_error
849 : #undef _
850 : };
851 :
852 : static void
853 0 : af_xdp_clear (u32 dev_instance)
854 : {
855 0 : af_xdp_main_t *am = &af_xdp_main;
856 0 : af_xdp_device_t *ad = pool_elt_at_index (am->devices, dev_instance);
857 0 : clib_error_free (ad->error);
858 0 : }
859 :
860 : /* *INDENT-OFF* */
861 10079 : VNET_DEVICE_CLASS (af_xdp_device_class) = {
862 : .name = "AF_XDP interface",
863 : .format_device = format_af_xdp_device,
864 : .format_device_name = format_af_xdp_device_name,
865 : .admin_up_down_function = af_xdp_interface_admin_up_down,
866 : .rx_mode_change_function = af_xdp_interface_rx_mode_change,
867 : .rx_redirect_to_node = af_xdp_set_interface_next_node,
868 : .tx_function_n_errors = AF_XDP_TX_N_ERROR,
869 : .tx_function_error_strings = af_xdp_tx_func_error_strings,
870 : .mac_addr_change_function = af_xdp_mac_change,
871 : .clear_counters = af_xdp_clear,
872 : };
873 : /* *INDENT-ON* */
874 :
875 : clib_error_t *
876 559 : af_xdp_init (vlib_main_t * vm)
877 : {
878 559 : af_xdp_main_t *am = &af_xdp_main;
879 :
880 559 : am->log_class = vlib_log_register_class ("af_xdp", 0);
881 :
882 559 : return 0;
883 : }
884 :
885 1119 : VLIB_INIT_FUNCTION (af_xdp_init);
886 :
887 : /*
888 : * fd.io coding-style-patch-verification: ON
889 : *
890 : * Local Variables:
891 : * eval: (c-set-style "gnu")
892 : * End:
893 : */
|