Line data Source code
1 : /*
2 : * Copyright (c) 2016 Cisco and/or its affiliates.
3 : * Licensed under the Apache License, Version 2.0 (the "License");
4 : * you may not use this file except in compliance with the License.
5 : * You may obtain a copy of the License at:
6 : *
7 : * http://www.apache.org/licenses/LICENSE-2.0
8 : *
9 : * Unless required by applicable law or agreed to in writing, software
10 : * distributed under the License is distributed on an "AS IS" BASIS,
11 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 : * See the License for the specific language governing permissions and
13 : * limitations under the License.
14 : */
15 :
16 : /**
17 : * lb-plugin implements a MagLev-like load balancer.
18 : * http://research.google.com/pubs/pub44824.html
19 : *
20 : * It hasn't been tested for interoperability with the original MagLev
21 : * but intends to provide similar functionality.
22 : * The load-balancer receives traffic destined to VIP (Virtual IP)
23 : * addresses from one or multiple(ECMP) routers.
24 : * The load-balancer tunnels the traffic toward many application servers
25 : * ensuring session stickiness (i.e. that a single session is tunneled
26 : * towards a single application server).
27 : *
28 : */
29 :
30 : #ifndef LB_PLUGIN_LB_LB_H_
31 : #define LB_PLUGIN_LB_LB_H_
32 :
33 : #include <lb/util.h>
34 : #include <vnet/util/refcount.h>
35 :
36 : #include <vnet/vnet.h>
37 : #include <vnet/ip/ip.h>
38 : #include <vnet/dpo/dpo.h>
39 : #include <vnet/fib/fib_table.h>
40 : #include <vppinfra/hash.h>
41 : #include <vppinfra/bihash_8_8.h>
42 : #include <vppinfra/bihash_24_8.h>
43 : #include <lb/lbhash.h>
44 : #include <vppinfra/lock.h>
45 :
/*
 * Default sizing constants.
 *
 * Every expansion is fully parenthesized so that the shift cannot be
 * re-associated with neighbouring operators at the point of use
 * (e.g. `LB_DEFAULT_PER_CPU_STICKY_BUCKETS - 1` must be 1023, not 1 << 9).
 */

/* Default number of buckets of the per-CPU sticky hash table. */
#define LB_DEFAULT_PER_CPU_STICKY_BUCKETS (1 << 10)
/* Default flow timeout (lb_main_t documents flow_timeout in seconds). */
#define LB_DEFAULT_FLOW_TIMEOUT 40
/* Bucket count and memory size (64 MiB) for the mapping tables. */
#define LB_MAPPING_BUCKETS 1024
#define LB_MAPPING_MEMORY_SIZE (64 << 20)

/* Bucket count and memory size (64 MiB) for the per-port VIP table. */
#define LB_VIP_PER_PORT_BUCKETS 1024
#define LB_VIP_PER_PORT_MEMORY_SIZE (64 << 20)
53 :
/* Next indices: a single drop entry followed by the count sentinel. */
typedef enum
{
  LB_NEXT_DROP, /* 0 */
  LB_N_NEXT,    /* number of entries above */
} lb_next_t;
58 :
/* Next indices for the NAT4 in2out path: drop or lookup. */
typedef enum
{
  LB_NAT4_IN2OUT_NEXT_DROP,   /* 0 */
  LB_NAT4_IN2OUT_NEXT_LOOKUP, /* 1 */
  LB_NAT4_IN2OUT_N_NEXT,      /* number of entries above */
} LB_nat4_in2out_next_t;
64 :
/* Next indices for the NAT6 in2out path: drop or lookup. */
typedef enum
{
  LB_NAT6_IN2OUT_NEXT_DROP,   /* 0 */
  LB_NAT6_IN2OUT_NEXT_LOOKUP, /* 1 */
  LB_NAT6_IN2OUT_N_NEXT,      /* number of entries above */
} LB_nat6_in2out_next_t;
70 :
/*
 * X-macro listing the NAT in2out error counters as
 * _(SYMBOL, "description") pairs.
 */
#define foreach_lb_nat_in2out_error                                           \
  _ (UNSUPPORTED_PROTOCOL, "Unsupported protocol")                            \
  _ (IN2OUT_PACKETS, "Good in2out packets processed")                         \
  _ (NO_TRANSLATION, "No translation")

/*
 * One LB_NAT_IN2OUT_ERROR_<SYMBOL> enumerator per entry of the list above,
 * in list order, terminated by the count sentinel.
 */
typedef enum
{
#define _(name, text) LB_NAT_IN2OUT_ERROR_##name,
  foreach_lb_nat_in2out_error
#undef _
  LB_NAT_IN2OUT_N_ERROR,
} lb_nat_in2out_error_t;
82 :
/**
 * Service types supported by lb for kube-proxy
 * (two are enumerated here: ClusterIP and NodePort).
 */
typedef enum
{
  LB_SRV_TYPE_CLUSTERIP, /* 0 */
  LB_SRV_TYPE_NODEPORT,  /* 1 */
  LB_SRV_N_TYPES,        /* number of service types */
} lb_svr_type_t;
91 :
/* Next indices for the IPv4 nodeport path: hand over to NAT4, or drop. */
typedef enum
{
  LB4_NODEPORT_NEXT_IP4_NAT4, /* 0 */
  LB4_NODEPORT_NEXT_DROP,     /* 1 */
  LB4_NODEPORT_N_NEXT,        /* number of entries above */
} lb4_nodeport_next_t;
97 :
/* Next indices for the IPv6 nodeport path: hand over to NAT6, or drop. */
typedef enum
{
  LB6_NODEPORT_NEXT_IP6_NAT6, /* 0 */
  LB6_NODEPORT_NEXT_DROP,     /* 1 */
  LB6_NODEPORT_N_NEXT,        /* number of entries above */
} lb6_nodeport_next_t;
103 :
104 : /**
105 : * Each VIP is configured with a set of
106 : * application server.
107 : */
108 : typedef struct {
109 : /**
110 : * Registration to FIB event.
111 : */
112 : fib_node_t fib_node;
113 :
114 : /**
115 : * Destination address used to tunnel traffic towards
116 : * that application server.
117 : * The address is also used as ID and pseudo-random
118 : * seed for the load-balancing process.
119 : */
120 : ip46_address_t address;
121 :
122 : /**
123 : * ASs are indexed by address and VIP Index.
124 : * Which means there will be duplicated if the same server
125 : * address is used for multiple VIPs.
126 : */
127 : u32 vip_index;
128 :
129 : /**
130 : * Some per-AS flags.
131 : * For now only LB_AS_FLAGS_USED is defined.
132 : */
133 : u8 flags;
134 :
135 : #define LB_AS_FLAGS_USED 0x1
136 :
137 : /**
138 : * Rotating timestamp of when LB_AS_FLAGS_USED flag was last set.
139 : *
140 : * AS removal is based on garbage collection and reference counting.
141 : * When an AS is removed, there is a race between configuration core
142 : * and worker cores which may still add a reference while it should not
143 : * be used. This timestamp is used to not remove the AS while a race condition
144 : * may happen.
145 : */
146 : u32 last_used;
147 :
148 : /**
149 : * The FIB entry index for the next-hop
150 : */
151 : fib_node_index_t next_hop_fib_entry_index;
152 :
153 : /**
154 : * The child index on the FIB entry
155 : */
156 : u32 next_hop_child_index;
157 :
158 : /**
159 : * The next DPO in the graph to follow.
160 : */
161 : dpo_id_t dpo;
162 :
163 : } lb_as_t;
164 :
165 : format_function_t format_lb_as;
166 :
167 : typedef struct {
168 : u32 as_index;
169 : } lb_new_flow_entry_t;
170 :
/*
 * X-macro listing the per-VIP counters as
 * _(SYMBOL, "description", value) triples.
 */
#define lb_foreach_vip_counter                                                \
  _ (NEXT_PACKET, "packet from existing sessions", 0)                         \
  _ (FIRST_PACKET, "first session packet", 1)                                 \
  _ (UNTRACKED_PACKET, "untracked packet", 2)                                 \
  _ (NO_SERVER, "no server configured", 3)

/*
 * LB_VIP_COUNTER_<SYMBOL> takes the explicit value from the list above;
 * LB_N_VIP_COUNTERS follows the last entry.
 */
typedef enum
{
#define _(sym, desc, val) LB_VIP_COUNTER_##sym = val,
  lb_foreach_vip_counter
#undef _
  LB_N_VIP_COUNTERS
} lb_vip_counter_t;
183 :
/* Encapsulation types: GRE4, GRE6, L3DSR, NAT4 and NAT6. */
typedef enum
{
  LB_ENCAP_TYPE_GRE4,  /* 0 */
  LB_ENCAP_TYPE_GRE6,  /* 1 */
  LB_ENCAP_TYPE_L3DSR, /* 2 */
  LB_ENCAP_TYPE_NAT4,  /* 3 */
  LB_ENCAP_TYPE_NAT6,  /* 4 */
  LB_ENCAP_N_TYPES,    /* number of encap types */
} lb_encap_type_t;
192 :
/**
 * Lookup type.
 */
typedef enum
{
  LB_LKP_SAME_IP_PORT, /* 0 */
  LB_LKP_DIFF_IP_PORT, /* 1 */
  LB_LKP_ALL_PORT_IP,  /* 2 */
  LB_LKP_N_TYPES,      /* number of lookup types */
} lb_lkp_type_t;
203 :
/**
 * VIP types.
 * The load balancer handles IPv4 and IPv6 traffic with GRE4, GRE6,
 * L3DSR, NAT4 and NAT6 encapsulation. Each enumerator is named
 * LB_VIP_TYPE_<traffic family>_<encap>.
 */
typedef enum
{
  LB_VIP_TYPE_IP6_GRE6,  /* 0 */
  LB_VIP_TYPE_IP6_GRE4,  /* 1 */
  LB_VIP_TYPE_IP4_GRE6,  /* 2 */
  LB_VIP_TYPE_IP4_GRE4,  /* 3 */
  LB_VIP_TYPE_IP4_L3DSR, /* 4 */
  LB_VIP_TYPE_IP4_NAT4,  /* 5 */
  LB_VIP_TYPE_IP6_NAT6,  /* 6 */
  LB_VIP_N_TYPES,        /* number of VIP types */
} lb_vip_type_t;
218 :
219 : format_function_t format_lb_vip_type;
220 : unformat_function_t unformat_lb_vip_type;
221 :
222 :
223 : /* args for different vip encap types */
224 : typedef struct {
225 : union
226 : {
227 : struct
228 : {
229 : /* Service type. clusterip or nodeport */
230 : u8 srv_type;
231 :
232 : /* Pod's port corresponding to specific service. network byte order */
233 : u16 target_port;
234 : };
235 : /* DSCP bits for L3DSR */
236 : u8 dscp;
237 : u64 as_u64;
238 : };
239 : } lb_vip_encap_args_t;
240 :
241 : typedef struct {
242 : /* all fields in NET byte order */
243 : union {
244 : struct {
245 : u32 vip_prefix_index;
246 : u16 port;
247 : u8 protocol;
248 : u8 rsv;
249 : };
250 : u64 as_u64;
251 : };
252 : } vip_port_key_t;
253 :
254 : /**
255 : * Load balancing service is provided per VIP+protocol+port.
256 : * In this data model, a VIP can be a whole prefix.
257 : * But load balancing only
258 : * occurs on a per-source-address/port basis. Meaning that if a given source
259 : * reuses the same port for multiple destinations within the same VIP,
260 : * they will be considered as a single flow.
261 : */
262 : typedef struct {
263 :
264 : //Runtime
265 :
266 : /**
267 : * Vector mapping (flow-hash & new_connect_table_mask) to AS index.
268 : * This is used for new flows.
269 : */
270 : lb_new_flow_entry_t *new_flow_table;
271 :
272 : /**
273 : * New flows table length - 1
274 : * (length MUST be a power of 2)
275 : */
276 : u32 new_flow_table_mask;
277 :
278 : /**
279 : * Last time garbage collection was run to free the ASs.
280 : */
281 : u32 last_garbage_collection;
282 :
283 : //Not runtime
284 :
285 : /**
286 : * A Virtual IP represents a given service delivered
287 : * by a set of application servers. It can be a single
288 : * address or a prefix.
289 : * IPv4 prefixes are encoded using IPv4-in-IPv6 embedded address
290 : * (i.e. ::/96 prefix).
291 : */
292 : ip46_address_t prefix;
293 :
294 : /**
295 : * The VIP prefix length.
296 : * In case of IPv4, plen = 96 + ip4_plen.
297 : */
298 : u8 plen;
299 :
300 : /* tcp or udp. If not per-port vip, set to ~0 */
301 : u8 protocol;
302 :
303 : /* tcp port or udp port. If not per-port vip, set to ~0 */
304 : u16 port;
305 :
306 : /* Valid for per-port vip */
307 : u32 vip_prefix_index;
308 :
309 : /**
310 : * The type of traffic for this.
311 : * LB_TYPE_UNDEFINED if unknown.
312 : */
313 : lb_vip_type_t type;
314 :
315 : /* args for different vip encap types */
316 : lb_vip_encap_args_t encap_args;
317 :
318 : /**
319 : * Flags related to this VIP.
320 : * LB_VIP_FLAGS_USED means the VIP is active.
321 : * When it is not set, the VIP in the process of being removed.
322 : * We cannot immediately remove a VIP because the VIP index still may be stored
323 : * in the adjacency index.
324 : */
325 : u8 flags;
326 : #define LB_VIP_FLAGS_USED 0x1
327 : #define LB_VIP_FLAGS_SRC_IP_STICKY 0x2
328 :
329 : /**
330 : * Pool of AS indexes used for this VIP.
331 : * This also includes ASs that have been removed (but are still referenced).
332 : */
333 : u32 *as_indexes;
334 : } lb_vip_t;
335 :
/*
 * True when the VIP type `type` (an lb_vip_type_t value) carries IPv4
 * client traffic. The argument is parenthesized so that any expression
 * can be passed; note that it is still evaluated several times.
 */
#define lb_vip_is_ip4(type) ((type) == LB_VIP_TYPE_IP4_GRE6 \
                            || (type) == LB_VIP_TYPE_IP4_GRE4 \
                            || (type) == LB_VIP_TYPE_IP4_L3DSR \
                            || (type) == LB_VIP_TYPE_IP4_NAT4 )

/*
 * True when the VIP type `type` (an lb_vip_type_t value) carries IPv6
 * client traffic. Same argument rules as lb_vip_is_ip4.
 */
#define lb_vip_is_ip6(type) ((type) == LB_VIP_TYPE_IP6_GRE6 \
                            || (type) == LB_VIP_TYPE_IP6_GRE4 \
                            || (type) == LB_VIP_TYPE_IP6_NAT6 )
344 :
/*
 * True when the VIP's encapsulation is IPv4 (GRE4, L3DSR or NAT4),
 * whatever the family of the client traffic.
 * `vip` is a pointer to a VIP and is evaluated several times.
 */
#define lb_encap_is_ip4(vip)                                                  \
  ((vip)->type == LB_VIP_TYPE_IP6_GRE4 ||                                     \
   (vip)->type == LB_VIP_TYPE_IP4_GRE4 ||                                     \
   (vip)->type == LB_VIP_TYPE_IP4_L3DSR ||                                    \
   (vip)->type == LB_VIP_TYPE_IP4_NAT4)
349 :
/* 1 when LB_VIP_FLAGS_SRC_IP_STICKY is set in the VIP's flags, else 0. */
#define lb_vip_is_src_ip_sticky(vip)                                          \
  (0 != ((vip)->flags & LB_VIP_FLAGS_SRC_IP_STICKY))
352 :
/*
 * GRE classification helpers. Each takes a pointer to a VIP (evaluated
 * several times) and combines the encap family taken from the VIP type
 * with whether the VIP port is zero (plain) or non-zero (_port variant).
 */
/* clang-format off */
/* GRE4 encap, port == 0. */
#define lb_vip_is_gre4(vip)                      \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE4          \
    || (vip)->type == LB_VIP_TYPE_IP4_GRE4)      \
   && ((vip)->port == 0))

/* GRE6 encap, port == 0. */
#define lb_vip_is_gre6(vip)                      \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE6          \
    || (vip)->type == LB_VIP_TYPE_IP4_GRE6)      \
   && ((vip)->port == 0))

/* GRE4 encap, port != 0. */
#define lb_vip_is_gre4_port(vip)                 \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE4          \
    || (vip)->type == LB_VIP_TYPE_IP4_GRE4)      \
   && ((vip)->port != 0))

/* GRE6 encap, port != 0. */
#define lb_vip_is_gre6_port(vip)                 \
  (((vip)->type == LB_VIP_TYPE_IP6_GRE6          \
    || (vip)->type == LB_VIP_TYPE_IP4_GRE6)      \
   && ((vip)->port != 0))
/* clang-format on */
370 :
371 : always_inline bool
372 136 : lb_vip_is_l3dsr(const lb_vip_t *vip)
373 : {
374 136 : return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port == 0);
375 : }
376 :
377 : always_inline bool
378 79 : lb_vip_is_l3dsr_port(const lb_vip_t *vip)
379 : {
380 79 : return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port != 0);
381 : }
382 : always_inline bool
383 120 : lb_vip_is_nat4_port(const lb_vip_t *vip)
384 : {
385 120 : return (vip->type == LB_VIP_TYPE_IP4_NAT4 && vip->port != 0);
386 : }
387 : always_inline bool
388 88 : lb_vip_is_nat6_port(const lb_vip_t *vip)
389 : {
390 88 : return (vip->type == LB_VIP_TYPE_IP6_NAT6 && vip->port != 0);
391 : }
392 :
393 : format_function_t format_lb_vip;
394 : format_function_t format_lb_vip_detailed;
395 :
/*
 * X-macro listing the NAT protocols as
 * _(UPPER_NAME, index, lower_name, "string") tuples.
 */
#define foreach_lb_nat_protocol                                               \
  _ (UDP, 0, udp, "udp")                                                      \
  _ (TCP, 1, tcp, "tcp")

/* LB_NAT_PROTOCOL_<UPPER_NAME> takes the index given in the list above. */
typedef enum
{
#define _(UPPER, idx, lower, str) LB_NAT_PROTOCOL_##UPPER = idx,
  foreach_lb_nat_protocol
#undef _
} lb_nat_protocol_t;
405 :
406 : always_inline u32
407 0 : lb_ip_proto_to_nat_proto (u8 ip_proto)
408 : {
409 0 : u32 nat_proto = ~0;
410 :
411 0 : nat_proto = (ip_proto == IP_PROTOCOL_UDP) ? LB_NAT_PROTOCOL_UDP : nat_proto;
412 0 : nat_proto = (ip_proto == IP_PROTOCOL_TCP) ? LB_NAT_PROTOCOL_TCP : nat_proto;
413 :
414 0 : return nat_proto;
415 : }
416 :
417 : /* Key for Pod's egress SNAT */
418 : typedef struct {
419 : union
420 : {
421 : struct
422 : {
423 : ip4_address_t addr;
424 : u16 port;
425 : u16 protocol:3,
426 : fib_index:13;
427 : };
428 : u64 as_u64;
429 : };
430 : } lb_snat4_key_t;
431 :
432 : typedef struct
433 : {
434 : union
435 : {
436 : struct
437 : {
438 : ip6_address_t addr;
439 : u16 port;
440 : u16 protocol;
441 : u32 fib_index;
442 : };
443 : u64 as_u64[3];
444 : };
445 : } lb_snat6_key_t;
446 :
447 : typedef struct {
448 : /**
449 : * for vip + port case, src_ip = vip;
450 : * for node ip + node_port, src_ip = node_ip
451 : */
452 : ip46_address_t src_ip;
453 : ip46_address_t as_ip;
454 : u8 src_ip_is_ipv6;
455 : u8 as_ip_is_ipv6;
456 : /**
457 : * Network byte order
458 : * for vip + port case, src_port = port;
459 : * for node ip + node_port, src_port = node_port
460 : */
461 : u16 src_port;
462 : u16 target_port; /* Network byte order */
463 : u32 vrf_id;
464 : u32 fib_index;
465 : } lb_snat_mapping_t;
466 :
467 : typedef struct {
468 : /**
469 : * Each CPU has its own sticky flow hash table.
470 : * One single table is used for all VIPs.
471 : */
472 : lb_hash_t *sticky_ht;
473 : } lb_per_cpu_t;
474 :
475 : typedef struct {
476 : /**
477 : * Pool of all Virtual IPs
478 : */
479 : lb_vip_t *vips;
480 :
481 : /**
482 : * bitmap for vip prefix to support per-port vip
483 : */
484 : uword *vip_prefix_indexes;
485 :
486 : /**
487 : * Pool of ASs.
488 : * ASs are referenced by address and vip index.
489 : * The first element (index 0) is special and used only to fill
490 : * new_flow_tables when no AS has been configured.
491 : */
492 : lb_as_t *ass;
493 :
494 : /**
495 : * Each AS has an associated reference counter.
496 : * As ass[0] has a special meaning, its associated counter
497 : * starts at 0 and is decremented instead. i.e. do not use it.
498 : */
499 : vlib_refcount_t as_refcount;
500 :
501 : /* hash lookup vip_index by key: {u16: nodeport} */
502 : uword * vip_index_by_nodeport;
503 :
504 : /**
505 : * Some global data is per-cpu
506 : */
507 : lb_per_cpu_t *per_cpu;
508 :
509 : /**
510 : * Node next index for IP adjacencies, for each of the traffic types.
511 : */
512 : u32 ip_lookup_next_index[LB_VIP_N_TYPES];
513 :
514 : /**
515 : * Source address used in IPv6 encapsulated traffic
516 : */
517 : ip6_address_t ip6_src_address;
518 :
519 : /**
520 : * Source address used for IPv4 encapsulated traffic
521 : */
522 : ip4_address_t ip4_src_address;
523 :
524 : /**
525 : * Number of buckets in the per-cpu sticky hash table.
526 : */
527 : u32 per_cpu_sticky_buckets;
528 :
529 : /**
530 : * Flow timeout in seconds.
531 : */
532 : u32 flow_timeout;
533 :
534 : /**
535 : * Per VIP counter
536 : */
537 : vlib_simple_counter_main_t vip_counters[LB_N_VIP_COUNTERS];
538 :
539 : /**
540 : * DPO used to send packet from IP4/6 lookup to LB node.
541 : */
542 : dpo_type_t dpo_gre4_type;
543 : dpo_type_t dpo_gre6_type;
544 : dpo_type_t dpo_gre4_port_type;
545 : dpo_type_t dpo_gre6_port_type;
546 : dpo_type_t dpo_l3dsr_type;
547 : dpo_type_t dpo_l3dsr_port_type;
548 : dpo_type_t dpo_nat4_port_type;
549 : dpo_type_t dpo_nat6_port_type;
550 : /**
551 : * Node type for registering to fib changes.
552 : */
553 : fib_node_type_t fib_node_type;
554 :
555 : /* lookup per_port vip by key */
556 : clib_bihash_8_8_t vip_index_per_port;
557 :
558 : /* Find a static mapping by AS IP : target_port */
559 : clib_bihash_8_8_t mapping_by_as4;
560 : clib_bihash_24_8_t mapping_by_as6;
561 :
562 : /* Static mapping pool */
563 : lb_snat_mapping_t * snat_mappings;
564 :
565 : /**
566 : * API dynamically registered base ID.
567 : */
568 : u16 msg_id_base;
569 :
570 : clib_spinlock_t writer_lock;
571 :
572 : /* convenience */
573 : vlib_main_t *vlib_main;
574 : vnet_main_t *vnet_main;
575 : } lb_main_t;
576 :
577 : /* args for different vip encap types */
578 : typedef struct {
579 : ip46_address_t prefix;
580 : u8 plen;
581 : u8 protocol;
582 : u16 port;
583 : u8 src_ip_sticky;
584 : lb_vip_type_t type;
585 : u32 new_length;
586 : lb_vip_encap_args_t encap_args;
587 : } lb_vip_add_args_t;
588 :
589 : extern lb_main_t lb_main;
590 : extern vlib_node_registration_t lb4_node;
591 : extern vlib_node_registration_t lb6_node;
592 : extern vlib_node_registration_t lb4_nodeport_node;
593 : extern vlib_node_registration_t lb6_nodeport_node;
594 : extern vlib_node_registration_t lb_nat4_in2out_node;
595 : extern vlib_node_registration_t lb_nat6_in2out_node;
596 :
597 : /**
598 : * Fix global load-balancer parameters.
599 : * @param ip4_address IPv4 source address used for encapsulated traffic
600 : * @param ip6_address IPv6 source address used for encapsulated traffic
601 : * @param sticky_buckets FIXME
602 : * @param flow_timeout FIXME
603 : * @return 0 on success. VNET_LB_ERR_XXX on error
604 : */
605 : int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address,
606 : u32 sticky_buckets, u32 flow_timeout);
607 :
608 : int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index);
609 :
610 : int lb_vip_del(u32 vip_index);
611 :
612 : int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u8 protocol,
613 : u16 port, u32 *vip_index);
614 :
/**
 * Look up a VIP by its index in the lb_main.vips pool.
 *
 * Evaluates to NULL when the index is free in the pool, otherwise to the
 * pool element at that index. The argument is parenthesized so that any
 * expression can be passed; it is still evaluated more than once, so it
 * must be free of side effects.
 */
#define lb_vip_get_by_index(index)                                            \
  (pool_is_free_index (lb_main.vips, (index))                                 \
     ? NULL                                                                   \
     : pool_elt_at_index (lb_main.vips, (index)))
616 :
617 : int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n);
618 : int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n, u8 flush);
619 : int lb_flush_vip_as (u32 vip_index, u32 as_index);
620 :
621 : u32 lb_hash_time_now(vlib_main_t * vm);
622 :
/*
 * Run the load-balancer garbage collection.
 * Declared with an explicit (void) parameter list so that this is a real
 * prototype and calls with arguments are rejected at compile time.
 */
void lb_garbage_collection (void);
624 :
625 : int lb_nat4_interface_add_del (u32 sw_if_index, int is_del);
626 : int lb_nat6_interface_add_del (u32 sw_if_index, int is_del);
627 :
628 : format_function_t format_lb_main;
629 :
630 : #endif /* LB_PLUGIN_LB_LB_H_ */
|