Line data Source code
1 : /*
2 : * Copyright (c) 2015 Cisco and/or its affiliates.
3 : * Licensed under the Apache License, Version 2.0 (the "License");
4 : * you may not use this file except in compliance with the License.
5 : * You may obtain a copy of the License at:
6 : *
7 : * http://www.apache.org/licenses/LICENSE-2.0
8 : *
9 : * Unless required by applicable law or agreed to in writing, software
10 : * distributed under the License is distributed on an "AS IS" BASIS,
11 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 : * See the License for the specific language governing permissions and
13 : * limitations under the License.
14 : */
15 : /*
16 : * input.c: Unix file input
17 : *
18 : * Copyright (c) 2008 Eliot Dresselhaus
19 : *
20 : * Permission is hereby granted, free of charge, to any person obtaining
21 : * a copy of this software and associated documentation files (the
22 : * "Software"), to deal in the Software without restriction, including
23 : * without limitation the rights to use, copy, modify, merge, publish,
24 : * distribute, sublicense, and/or sell copies of the Software, and to
25 : * permit persons to whom the Software is furnished to do so, subject to
26 : * the following conditions:
27 : *
28 : * The above copyright notice and this permission notice shall be
29 : * included in all copies or substantial portions of the Software.
30 : *
31 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32 : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34 : * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35 : * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36 : * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37 : * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38 : */
39 :
40 : #include <vlib/vlib.h>
41 : #include <vlib/unix/unix.h>
42 : #include <signal.h>
43 : #include <unistd.h>
44 : #include <vppinfra/tw_timer_1t_3w_1024sl_ov.h>
45 :
46 : /* FIXME autoconf */
47 : #define HAVE_LINUX_EPOLL
48 :
49 : #ifdef HAVE_LINUX_EPOLL
50 :
51 : #include <sys/epoll.h>
52 :
53 : typedef struct
54 : {
55 : CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
56 : int epoll_fd;
57 : struct epoll_event *epoll_events;
58 : int n_epoll_fds;
59 :
60 : /* Statistics. */
61 : u64 epoll_files_ready;
62 : u64 epoll_waits;
63 : } linux_epoll_main_t;
64 :
65 : static linux_epoll_main_t *linux_epoll_mains = 0;
66 :
67 : static void
68 7667 : linux_epoll_file_update (clib_file_t * f, clib_file_update_type_t update_type)
69 : {
70 7667 : clib_file_main_t *fm = &file_main;
71 7667 : linux_epoll_main_t *em = vec_elt_at_index (linux_epoll_mains,
72 : f->polling_thread_index);
73 7667 : struct epoll_event e = { 0 };
74 7667 : int op, add_del = 0;
75 :
76 7667 : e.events = EPOLLIN;
77 7667 : if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE)
78 730 : e.events |= EPOLLOUT;
79 7667 : if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED)
80 2496 : e.events |= EPOLLET;
81 7667 : e.data.u32 = f - fm->file_pool;
82 :
83 7667 : op = -1;
84 :
85 7667 : switch (update_type)
86 : {
87 3708 : case UNIX_FILE_UPDATE_ADD:
88 3708 : op = EPOLL_CTL_ADD;
89 3708 : add_del = 1;
90 3708 : break;
91 :
92 1460 : case UNIX_FILE_UPDATE_MODIFY:
93 1460 : op = EPOLL_CTL_MOD;
94 1460 : break;
95 :
96 2499 : case UNIX_FILE_UPDATE_DELETE:
97 2499 : op = EPOLL_CTL_DEL;
98 2499 : add_del = -1;
99 2499 : break;
100 :
101 0 : default:
102 0 : clib_warning ("unknown update_type %d", update_type);
103 0 : return;
104 : }
105 :
106 : /* worker threads open epoll fd only if needed */
107 7667 : if (update_type == UNIX_FILE_UPDATE_ADD && em->epoll_fd == -1)
108 : {
109 0 : em->epoll_fd = epoll_create (1);
110 0 : if (em->epoll_fd < 0)
111 : {
112 0 : clib_unix_warning ("epoll_create");
113 0 : return;
114 : }
115 0 : em->n_epoll_fds = 0;
116 : }
117 :
118 7667 : if (epoll_ctl (em->epoll_fd, op, f->file_descriptor, &e) < 0)
119 : {
120 0 : clib_unix_warning ("epoll_ctl");
121 0 : return;
122 : }
123 :
124 7667 : em->n_epoll_fds += add_del;
125 :
126 7667 : if (em->n_epoll_fds == 0)
127 : {
128 0 : close (em->epoll_fd);
129 0 : em->epoll_fd = -1;
130 : }
131 : }
132 :
133 : static_always_inline uword
134 85651700 : linux_epoll_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
135 : vlib_frame_t * frame, u32 thread_index)
136 : {
137 85651700 : unix_main_t *um = &unix_main;
138 85651700 : clib_file_main_t *fm = &file_main;
139 85651700 : linux_epoll_main_t *em = vec_elt_at_index (linux_epoll_mains, thread_index);
140 : struct epoll_event *e;
141 : int n_fds_ready;
142 85651700 : int is_main = (thread_index == 0);
143 :
144 : {
145 85651700 : vlib_node_main_t *nm = &vm->node_main;
146 : u32 ticks_until_expiration;
147 : f64 timeout;
148 : f64 now;
149 85651700 : int timeout_ms = 0, max_timeout_ms = 10;
150 85651700 : f64 vector_rate = vlib_last_vectors_per_main_loop (vm);
151 :
152 85651800 : if (is_main == 0)
153 169203 : now = vlib_time_now (vm);
154 :
155 : /*
156 : * If we've been asked for a fixed-sleep between main loop polls,
157 : * do so right away.
158 : */
159 85652400 : if (PREDICT_FALSE (is_main && um->poll_sleep_usec))
160 0 : {
161 : struct timespec ts, tsrem;
162 0 : timeout = 0;
163 0 : timeout_ms = 0;
164 0 : node->input_main_loops_per_call = 0;
165 0 : ts.tv_sec = 0;
166 0 : ts.tv_nsec = 1000 * um->poll_sleep_usec;
167 :
168 0 : while (nanosleep (&ts, &tsrem) < 0)
169 : {
170 0 : ts = tsrem;
171 : }
172 : }
173 : /* If we're not working very hard, decide how long to sleep */
174 85652400 : else if (is_main && vector_rate < 2 && vm->api_queue_nonempty == 0
175 85454900 : && nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0)
176 : {
177 84915600 : ticks_until_expiration = TW (tw_timer_first_expires_in_ticks)
178 84915600 : ((TWT (tw_timer_wheel) *) nm->timing_wheel);
179 :
180 : /* Nothing on the fast wheel, sleep 10ms */
181 84915600 : if (ticks_until_expiration == TW_SLOTS_PER_RING)
182 : {
183 0 : timeout = 10e-3;
184 0 : timeout_ms = max_timeout_ms;
185 : }
186 : else
187 : {
188 84915600 : timeout = (f64) ticks_until_expiration *1e-5;
189 84915600 : if (timeout < 1e-3)
190 83745000 : timeout_ms = 0;
191 : else
192 : {
193 1170600 : timeout_ms = timeout * 1e3;
194 : /* Must be between 1 and 10 ms. */
195 1170600 : timeout_ms = clib_max (1, timeout_ms);
196 1170600 : timeout_ms = clib_min (max_timeout_ms, timeout_ms);
197 : }
198 : }
199 84915600 : node->input_main_loops_per_call = 0;
200 : }
201 736798 : else if (is_main == 0 && vector_rate < 2 &&
202 169486 : (vlib_get_first_main ()->time_last_barrier_release + 0.5 < now) &&
203 129412 : nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0)
204 : {
205 128667 : timeout = 10e-3;
206 128667 : timeout_ms = max_timeout_ms;
207 128667 : node->input_main_loops_per_call = 0;
208 : }
209 : else /* busy */
210 : {
211 : /* Don't come back for a respectable number of dispatch cycles */
212 608071 : node->input_main_loops_per_call = 1024;
213 : }
214 :
215 : /* Allow any signal to wakeup our sleep. */
216 85652300 : if (is_main || em->epoll_fd != -1)
217 85482900 : {
218 : static sigset_t unblock_all_signals;
219 85482900 : n_fds_ready = epoll_pwait (em->epoll_fd,
220 : em->epoll_events,
221 85482900 : vec_len (em->epoll_events),
222 : timeout_ms, &unblock_all_signals);
223 :
224 : /* This kludge is necessary to run over absurdly old kernels */
225 85482900 : if (n_fds_ready < 0 && errno == ENOSYS)
226 : {
227 0 : n_fds_ready = epoll_wait (em->epoll_fd,
228 : em->epoll_events,
229 0 : vec_len (em->epoll_events), timeout_ms);
230 : }
231 :
232 : }
233 : else
234 : {
235 : /*
236 : * Worker thread, no epoll fd's, sleep for 100us at a time
237 : * and check for a barrier sync request
238 : */
239 169431 : if (timeout_ms)
240 : {
241 : struct timespec ts, tsrem;
242 128667 : f64 limit = now + (f64) timeout_ms * 1e-3;
243 :
244 8489020 : while (vlib_time_now (vm) < limit)
245 : {
246 : /* Sleep for 100us at a time */
247 8182460 : ts.tv_sec = 0;
248 8182460 : ts.tv_nsec = 1000 * 100;
249 :
250 8182460 : while (nanosleep (&ts, &tsrem) < 0)
251 0 : ts = tsrem;
252 8360990 : if (*vlib_worker_threads->wait_at_barrier ||
253 8361570 : nm->pending_interrupts)
254 638 : goto done;
255 : }
256 : }
257 182972 : goto done;
258 : }
259 : }
260 :
261 85482900 : if (n_fds_ready < 0)
262 : {
263 501 : if (unix_error_is_fatal (errno))
264 0 : vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait"));
265 :
266 : /* non fatal error (e.g. EINTR). */
267 501 : goto done;
268 : }
269 :
270 85482400 : em->epoll_waits += 1;
271 85482400 : em->epoll_files_ready += n_fds_ready;
272 :
273 86116600 : for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++)
274 : {
275 634215 : u32 i = e->data.u32;
276 : clib_file_t *f;
277 : clib_error_t *errors[4];
278 634215 : int n_errors = 0;
279 :
280 : /*
281 : * Under rare scenarios, epoll may still post us events for the
282 : * deleted file descriptor. We just deal with it and throw away the
283 : * events for the corresponding file descriptor.
284 : */
285 634215 : f = fm->file_pool + i;
286 634215 : if (PREDICT_FALSE (pool_is_free (fm->file_pool, f)))
287 : {
288 0 : if (e->events & EPOLLIN)
289 : {
290 0 : errors[n_errors] =
291 0 : clib_error_return (0, "epoll event EPOLLIN dropped due "
292 : "to free index %u", i);
293 0 : n_errors++;
294 : }
295 0 : if (e->events & EPOLLOUT)
296 : {
297 0 : errors[n_errors] =
298 0 : clib_error_return (0, "epoll event EPOLLOUT dropped due "
299 : "to free index %u", i);
300 0 : n_errors++;
301 : }
302 0 : if (e->events & EPOLLERR)
303 : {
304 0 : errors[n_errors] =
305 0 : clib_error_return (0, "epoll event EPOLLERR dropped due "
306 : "to free index %u", i);
307 0 : n_errors++;
308 : }
309 : }
310 634215 : else if (PREDICT_TRUE (!(e->events & EPOLLERR)))
311 : {
312 634215 : if (e->events & EPOLLIN)
313 : {
314 633370 : f->read_events++;
315 633370 : errors[n_errors] = f->read_function (f);
316 : /* Make sure f is valid if the file pool moves */
317 633370 : if (pool_is_free_index (fm->file_pool, i))
318 14 : continue;
319 633356 : f = pool_elt_at_index (fm->file_pool, i);
320 633356 : n_errors += errors[n_errors] != 0;
321 : }
322 634201 : if (e->events & EPOLLOUT)
323 : {
324 845 : f->write_events++;
325 845 : errors[n_errors] = f->write_function (f);
326 845 : n_errors += errors[n_errors] != 0;
327 : }
328 : }
329 : else
330 : {
331 0 : if (f->error_function)
332 : {
333 0 : f->error_events++;
334 0 : errors[n_errors] = f->error_function (f);
335 0 : n_errors += errors[n_errors] != 0;
336 : }
337 : else
338 0 : close (f->file_descriptor);
339 : }
340 :
341 634201 : ASSERT (n_errors < ARRAY_LEN (errors));
342 634201 : for (i = 0; i < n_errors; i++)
343 : {
344 0 : unix_save_error (um, errors[i]);
345 : }
346 : }
347 :
348 85482400 : done:
349 85666500 : if (PREDICT_FALSE (vm->cpu_id != clib_get_current_cpu_id ()))
350 : {
351 0 : vm->cpu_id = clib_get_current_cpu_id ();
352 0 : vm->numa_node = clib_get_current_numa_node ();
353 : }
354 :
355 85653400 : return 0;
356 : }
357 :
358 : static uword
359 85651700 : linux_epoll_input (vlib_main_t * vm,
360 : vlib_node_runtime_t * node, vlib_frame_t * frame)
361 : {
362 85651700 : u32 thread_index = vlib_get_thread_index ();
363 :
364 85651700 : if (thread_index == 0)
365 85482900 : return linux_epoll_input_inline (vm, node, frame, 0);
366 : else
367 168820 : return linux_epoll_input_inline (vm, node, frame, thread_index);
368 : }
369 :
370 : /* *INDENT-OFF* */
371 183788 : VLIB_REGISTER_NODE (linux_epoll_input_node,static) = {
372 : .function = linux_epoll_input,
373 : .type = VLIB_NODE_TYPE_PRE_INPUT,
374 : .name = "unix-epoll-input",
375 : };
376 : /* *INDENT-ON* */
377 :
378 : clib_error_t *
379 575 : linux_epoll_input_init (vlib_main_t * vm)
380 : {
381 : linux_epoll_main_t *em;
382 575 : clib_file_main_t *fm = &file_main;
383 575 : vlib_thread_main_t *tm = vlib_get_thread_main ();
384 :
385 :
386 575 : vec_validate_aligned (linux_epoll_mains, tm->n_vlib_mains,
387 : CLIB_CACHE_LINE_BYTES);
388 :
389 1780 : vec_foreach (em, linux_epoll_mains)
390 : {
391 : /* Allocate some events. */
392 1205 : vec_resize (em->epoll_events, VLIB_FRAME_SIZE);
393 :
394 1205 : if (linux_epoll_mains == em)
395 : {
396 575 : em->epoll_fd = epoll_create (1);
397 575 : if (em->epoll_fd < 0)
398 0 : return clib_error_return_unix (0, "epoll_create");
399 : }
400 : else
401 630 : em->epoll_fd = -1;
402 : }
403 :
404 575 : fm->file_update = linux_epoll_file_update;
405 :
406 575 : return 0;
407 : }
408 :
409 4031 : VLIB_INIT_FUNCTION (linux_epoll_input_init);
410 :
411 : #endif /* HAVE_LINUX_EPOLL */
412 :
413 : static clib_error_t *
414 575 : unix_input_init (vlib_main_t * vm)
415 : {
416 575 : return 0;
417 : }
418 :
419 : /* *INDENT-OFF* */
420 2303 : VLIB_INIT_FUNCTION (unix_input_init) =
421 : {
422 : .runs_before = VLIB_INITS ("linux_epoll_input_init"),
423 : };
424 : /* *INDENT-ON* */
425 :
426 : /*
427 : * fd.io coding-style-patch-verification: ON
428 : *
429 : * Local Variables:
430 : * eval: (c-set-style "gnu")
431 : * End:
432 : */
|