From a34c14821b268ac663f4652cf1911448053895e8 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Thu, 12 Jun 2025 23:12:13 +0530 Subject: [PATCH 01/43] feat: new pool API and error handling - Added a new flash_pool API for memory management, it enables us to transmit packets even without requiring any packet reception. - Shifted the stats thread function to periodically dump statistics for each socket to library. NFs may write their own function using our advanced API. - Refactored existing functions in flash_nf and flash_txrx to handle error and integrate the new pool API. - Updated command line argument parsing to include new options and improve usability. - Improved error handling and logging throughout the codebase. - The l2fwd application is modified to support the new pool API, all other NFs may not work. They will be updated in future commits. The old API will be retired subsequently. - Introduced new version number 25.5.0 following year.month.version paradigm. --- examples/arpresolver/main.c | 10 +- examples/firewall/main.c | 14 +- examples/ip4ping/main.c | 10 +- examples/l2fwd/main.c | 197 +++++++++++--------- examples/maglev/main.c | 10 +- examples/mica/main.c | 10 +- examples/simple-firewall/main.c | 10 +- examples/simplefwd/main.c | 8 +- examples/unit-tests/backpressure.c | 8 +- examples/unit-tests/correctness.c | 14 +- examples/unit-tests/fwddrop.c | 10 +- examples/unit-tests/fwdrr.c | 8 +- examples/unit-tests/userspace-chain.c | 18 +- lib/flash/log/log.c | 6 +- lib/flash/meson.build | 2 +- lib/flash/nf/flash_nf.c | 170 +++++++++++++---- lib/flash/nf/flash_nf.h | 120 +++++++++++- lib/flash/nf/flash_stats.c | 27 +++ lib/flash/nf/flash_txrx.c | 257 +++++++++++++++++++++++++- lib/flash/nf/meson.build | 2 +- lib/flash/params/flash_params.c | 102 ++++++---- lib/flash/params/flash_params.h | 11 +- lib/flash/pool/flash_pool.c | 39 ++++ lib/flash/pool/flash_pool.h | 38 ++++ lib/flash/pool/meson.build | 12 ++ lib/include/flash_defines.h | 4 + meson.build | 2 +- 
27 files changed, 876 insertions(+), 243 deletions(-) create mode 100644 lib/flash/pool/flash_pool.c create mode 100644 lib/flash/pool/flash_pool.h create mode 100644 lib/flash/pool/meson.build diff --git a/examples/arpresolver/main.c b/examples/arpresolver/main.c index 9e9f751..285f439 100644 --- a/examples/arpresolver/main.c +++ b/examples/arpresolver/main.c @@ -254,11 +254,11 @@ static void *socket_routine(void *arg) for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *drop[nrecv]; unsigned int tot_pkt_drop = 0; @@ -323,8 +323,8 @@ static void *socket_routine(void *arg) } if (nrecv) { - size_t ret_send = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__dropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); + size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -401,7 +401,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/firewall/main.c b/examples/firewall/main.c index 601603a..c9f6d1f 100644 --- a/examples/firewall/main.c +++ b/examples/firewall/main.c @@ -148,11 +148,11 @@ static void *socket_routine(void *arg) for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = 
flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *drop[nrecv]; unsigned int tot_pkt_drop = 0; @@ -222,7 +222,7 @@ static void *socket_routine(void *arg) sid.dport = *dport; // Find murmurhash of sid - uint32_t sid_hash = murmurhash((void*)&sid, sizeof(struct session_id), 0); + uint32_t sid_hash = murmurhash((void *)&sid, sizeof(struct session_id), 0); bool invalid = false; for (int i = 0; i < NUM_INVALID_SESSIONS; i++) { if (invalid_sessions[i] == sid_hash) { @@ -231,13 +231,13 @@ static void *socket_routine(void *arg) break; } } - if (! invalid) + if (!invalid) send[tot_pkt_send++] = &msg.msg_iov[i]; } if (nrecv) { - size_t ret_send = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__dropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); + size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -314,7 +314,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/ip4ping/main.c b/examples/ip4ping/main.c index cdc02a1..413ed86 100644 --- a/examples/ip4ping/main.c +++ b/examples/ip4ping/main.c @@ -163,12 +163,12 @@ static void *socket_routine(void *arg) for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = 
flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); unsigned int tot_pkt_drop = 0; unsigned int tot_pkt_send = 0; @@ -224,8 +224,8 @@ static void *socket_routine(void *arg) } if (nrecv) { - size_t ret_send = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__dropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); + size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -324,7 +324,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c index 4f517a2..69de38e 100644 --- a/examples/l2fwd/main.c +++ b/examples/l2fwd/main.c @@ -2,6 +2,7 @@ * Copyright (c) 2025 Debojeet Das * * l2fwd: A simple NF that forwards packets between two interfaces + * after swapping or modifying MAC addresses. 
*/ #include @@ -16,11 +17,11 @@ bool done = false; struct config *cfg = NULL; -struct nf *nf; +struct nf *nf = NULL; static void int_exit(int sig) { - log_info("Received Signal: %d", sig); + log_debug("Received Signal: %d", sig); done = true; } @@ -55,12 +56,21 @@ static uint8_t *get_mac_addr(char *mac_addr) return dest_ether_addr_octet; } -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +// clang-format off +static const char *l2fwd_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-S \tEnable SR-IOV mode and set dest MAC address", + NULL +}; +// clang-format on + +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; - // Default values app_conf->cpu_start = 0; app_conf->cpu_end = 0; app_conf->stats_cpu = 1; @@ -69,8 +79,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "c:e:s:S:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:S:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -85,8 +98,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->sriov = true; break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + + return 0; } static void update_dest_mac(void *data) @@ -118,7 +134,7 @@ static void swap_mac_addresses(void *data) *dst_addr = tmp; } -struct Args { +struct sock_args { int socket_id; int *next; int next_size; @@ -126,149 +142,154 @@ struct Args { static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - int socket_id = a->socket_id; - log_info("SOCKET_ID: %d", socket_id); - static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; + int ret; + nfds_t nfds = 1; + struct socket *xsk; + struct xskvec *xskvecs; struct 
pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + uint32_t i, nrecv, nsend, nb_frags = 0; + struct sock_args *a = (struct sock_args *)arg; - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + log_debug("Socket ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; - fds[0].fd = nf->thread[socket_id]->socket->fd; - fds[0].events = POLLIN; + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("Failed to allocate send array"); + return NULL; + } - nf->thread[socket_id]->socket->idle_fd.fd = nf->thread[socket_id]->socket->fd; - nf->thread[socket_id]->socket->idle_fd.events = POLLIN; + fds[0].fd = xsk->fd; + fds[0].events = POLLIN; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; - bool eop = IS_EOP_DESC(xv->options); - - char *pkt = xv->data; + char *pkt = xskvecs[i].data; if (!nb_frags++) app_conf.sriov ? 
update_dest_mac(pkt) : swap_mac_addresses(pkt); - send[tot_pkt_send++] = &msg.msg_iov[i]; - if (eop) + if (IS_EOP_DESC(xskvecs[i].options)) nb_frags = 0; } if (nrecv) { - ret = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - if (ret != nrecv) { - log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + nsend = flash__sendmsg(cfg, xsk, xskvecs, nrecv); + if (nsend != nrecv) { + log_error("errno: %d/\"%s\"", errno, strerror(errno)); + break; } } if (done) break; } - free(msg.msg_iov); - return NULL; -} - -static void *worker__stats(void *arg) -{ - (void)arg; - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } - } - } + free(xskvecs); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { - log_error("ERROR: Memory allocation failed\n"); + log_error("ERROR: Memory allocation failed"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "L2 Forwarding Application"; + cfg->app_options = l2fwd_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; - log_info("Control Plane Setup Done"); + if (flash__configure_nf(&nf, cfg) < 0) + goto 
out_cfg; + + log_info("Control Plane setup done..."); signal(SIGINT, int_exit); signal(SIGTERM, int_exit); signal(SIGABRT, int_exit); - log_info("STARTING Data Path"); + log_info("Starting Data Path..."); + + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg; + } for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; + args[i].socket_id = i; + // args[i].next = nf->next; + // args[i].next_size = nf->next_size; - log_info("2_NEXT_SIZE: %d", args->next_size); + // log_debug("Next Size ::: %d", args[i].next_size); - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + // for (int i = 0; i < args[i].next_size; i++) + // log_debug("Next Item [%d] ::: %d", i, nf->next[i]); - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } + CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { - log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating 
statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { - log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; } - pthread_detach(stats_thread); - wait_for_cmd(cfg); + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + flash__wait(cfg); flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + free(args); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } diff --git a/examples/maglev/main.c b/examples/maglev/main.c index 7777af1..87629b0 100644 --- a/examples/maglev/main.c +++ b/examples/maglev/main.c @@ -314,11 +314,11 @@ static void *socket_routine(void *arg) hashmap_init(&active_sessions, sizeof(struct session_id), sizeof(struct replace_info), MAX_SESSIONS); for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *drop[nrecv]; unsigned int tot_pkt_drop = 0; @@ -491,8 +491,8 @@ static void *socket_routine(void *arg) } if (nrecv) { - size_t ret_send = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__dropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); + size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); if (ret_send != tot_pkt_send || ret_drop 
!= tot_pkt_drop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); @@ -573,7 +573,7 @@ int main(int argc, char **argv) pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/mica/main.c b/examples/mica/main.c index 4e8a71e..e8f2f1f 100644 --- a/examples/mica/main.c +++ b/examples/mica/main.c @@ -296,11 +296,11 @@ static void *socket_routine(void *arg) for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *drop[nrecv]; unsigned int tot_pkt_drop = 0; @@ -400,8 +400,8 @@ static void *socket_routine(void *arg) } if (nrecv) { - size_t ret_send = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__dropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); + size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -472,7 +472,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/simple-firewall/main.c b/examples/simple-firewall/main.c index d4e57c1..7efbcf1 100644 --- a/examples/simple-firewall/main.c +++ b/examples/simple-firewall/main.c @@ -236,11 +236,11 @@ static void *socket_routine(void *arg) for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, 
cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *drop[nrecv]; unsigned int tot_pkt_drop = 0; @@ -317,8 +317,8 @@ static void *socket_routine(void *arg) } if (nrecv) { - size_t ret_send = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__dropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); + size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -395,7 +395,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/simplefwd/main.c b/examples/simplefwd/main.c index d01d45e..30799e7 100644 --- a/examples/simplefwd/main.c +++ b/examples/simplefwd/main.c @@ -88,12 +88,12 @@ static void *socket_routine(void *arg) for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *send[nrecv]; unsigned int tot_pkt_send = 0; for (i = 0; i < nrecv; i++) { @@ -111,7 +111,7 @@ static void *socket_routine(void *arg) } if (nrecv) { - ret = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + ret = flash__oldsendmsg(cfg, 
nf->thread[socket_id]->socket, send, tot_pkt_send); if (ret != nrecv) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -210,7 +210,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/unit-tests/backpressure.c b/examples/unit-tests/backpressure.c index b23546f..8e56341 100644 --- a/examples/unit-tests/backpressure.c +++ b/examples/unit-tests/backpressure.c @@ -342,12 +342,12 @@ static void *socket_routine(void *arg) unsigned int count = 0; for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret != 1) continue; } - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *send[nrecv]; unsigned int tot_pkt_send = 0; @@ -379,7 +379,7 @@ static void *socket_routine(void *arg) } if (nrecv) { - ret = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + ret = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); if (ret != nrecv) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -487,7 +487,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/unit-tests/correctness.c b/examples/unit-tests/correctness.c index d649176..08727d8 100644 --- a/examples/unit-tests/correctness.c +++ b/examples/unit-tests/correctness.c @@ -339,16 +339,16 @@ static void *socket_routine(void *arg) unsigned int count = 0; for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, 
cfg->xsk->poll_timeout); if (ret != 1) continue; } - // ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + // ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); // if (ret <= 0 || ret > 1) // continue; - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); // if (nrecv == 0) { // uint64_t tstamp = rdtsc(); @@ -361,9 +361,9 @@ static void *socket_routine(void *arg) // if (idle_timestamp && (tstamp > idle_timestamp)) { // idle_timestamp = 0; - // ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + // ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); // if (ret) - // nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + // nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); // else // continue; // } @@ -391,7 +391,7 @@ static void *socket_routine(void *arg) } if (nrecv) { - ret = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + ret = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); if (ret != nrecv) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -504,7 +504,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/unit-tests/fwddrop.c b/examples/unit-tests/fwddrop.c index e514bc7..070fd48 100644 --- a/examples/unit-tests/fwddrop.c +++ b/examples/unit-tests/fwddrop.c @@ -94,12 +94,12 @@ static void *socket_routine(void *arg) fds[0].events = POLLIN; for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = 
flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *drop[nrecv]; struct xskvec *send[nrecv]; @@ -128,8 +128,8 @@ static void *socket_routine(void *arg) } if (nrecv) { - size_t ret_send = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__dropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); + size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); @@ -228,7 +228,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/unit-tests/fwdrr.c b/examples/unit-tests/fwdrr.c index a0b7072..4d61571 100644 --- a/examples/unit-tests/fwdrr.c +++ b/examples/unit-tests/fwdrr.c @@ -150,12 +150,12 @@ static void *socket_routine(void *arg) unsigned int count = 0; for (;;) { if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__poll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); + ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); if (ret <= 0 || ret > 1) continue; } - nrecv = flash__recvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); struct xskvec *send[nrecv]; unsigned int tot_pkt_send = 0; for (i = 0; i < nrecv; i++) { @@ -177,7 +177,7 @@ static void *socket_routine(void *arg) } if (nrecv) { - ret = flash__sendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); + ret = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); if (ret != nrecv) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); 
@@ -276,7 +276,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/examples/unit-tests/userspace-chain.c b/examples/unit-tests/userspace-chain.c index 398fcea..317ed65 100644 --- a/examples/unit-tests/userspace-chain.c +++ b/examples/unit-tests/userspace-chain.c @@ -22,8 +22,8 @@ struct nf *nf; #define FLASH_MAX_SOCKETS 8 ///////////// owner ring buffer ///////////// -#define struct_size(p, member, count) \ - ({ \ +#define struct_size(p, member, count) \ + ({ \ size_t __size = sizeof(*(p)) + (count) * sizeof((p)->member[0]); \ (__size < sizeof(*(p))) ? SIZE_MAX : __size; \ }) @@ -76,8 +76,8 @@ struct guest_queue *guest_queues[FLASH_MAX_SOCKETS][FLASH_MAX_SOCKETS]; ///////////// guest ring buffer operations ///////////// -#define guest_cpu_relax() \ - do { \ +#define guest_cpu_relax() \ + do { \ asm volatile("pause\n" : : : "memory"); \ } while (0) @@ -477,10 +477,10 @@ static void *socket_routine(void *arg) } else { ret = guest_bulk_enqueue_rxtx(guest_queues[socket_id][socket_id + 1], descs, tot_pkt_send); - #ifdef STATS - nf->thread[socket_id]->socket->ring_stats.tx_npkts += ret; - nf->thread[socket_id]->socket->ring_stats.tx_frags += ret; - #endif +#ifdef STATS + nf->thread[socket_id]->socket->ring_stats.tx_npkts += ret; + nf->thread[socket_id]->socket->ring_stats.tx_frags += ret; +#endif } if (ret != nrecv) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); @@ -589,7 +589,7 @@ int main(int argc, char **argv) } pthread_detach(stats_thread); - wait_for_cmd(cfg); + flash__wait(cfg); flash__xsk_close(cfg, nf); diff --git a/lib/flash/log/log.c b/lib/flash/log/log.c index 188dab9..59d6ec7 100644 --- a/lib/flash/log/log.c +++ b/lib/flash/log/log.c @@ -52,10 +52,10 @@ static void stdout_callback(log_Event *ev) char buf[16]; buf[strftime(buf, sizeof(buf), "%H:%M:%S", ev->time)] = '\0'; #ifdef LOG_USE_COLOR - fprintf(ev->udata, "%s %s%-5s\x1b[0m \x1b[90m%s:%d:\x1b[0m 
[%s()]\t", buf, level_colors[ev->level], level_strings[ev->level], + fprintf(ev->udata, "%s %s%-5s\x1b[0m \x1b[90m%s:%d:\x1b[0m [%s()] ", buf, level_colors[ev->level], level_strings[ev->level], ev->file + 3, ev->line, ev->caller); #else - fprintf(ev->udata, "%s %-5s %s:%d: [%s()]\t", buf, level_strings[ev->level], ev->file, ev->line, ev->caller); + fprintf(ev->udata, "%s %-5s %s:%d: [%s()] ", buf, level_strings[ev->level], ev->file, ev->line, ev->caller); #endif vfprintf(ev->udata, ev->fmt, ev->ap); fprintf(ev->udata, "\n"); @@ -66,7 +66,7 @@ static void file_callback(log_Event *ev) { char buf[64]; buf[strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", ev->time)] = '\0'; - fprintf(ev->udata, "%s %-5s %s:%d: [%s()]\t", buf, level_strings[ev->level], ev->file, ev->line, ev->caller); + fprintf(ev->udata, "%s %-5s %s:%d: [%s()] ", buf, level_strings[ev->level], ev->file, ev->line, ev->caller); vfprintf(ev->udata, ev->fmt, ev->ap); fprintf(ev->udata, "\n"); fflush(ev->udata); diff --git a/lib/flash/meson.build b/lib/flash/meson.build index 83ef3ed..df73063 100644 --- a/lib/flash/meson.build +++ b/lib/flash/meson.build @@ -5,7 +5,7 @@ special_dirs = [ 'log', ] -dirs = ['uds', 'common', 'monitor', 'params', 'nf'] +dirs = ['uds', 'common', 'monitor', 'params', 'pool', 'nf'] foreach special_dir : special_dirs sources = [] diff --git a/lib/flash/nf/flash_nf.c b/lib/flash/nf/flash_nf.c index 6162422..ce4cc90 100644 --- a/lib/flash/nf/flash_nf.c +++ b/lib/flash/nf/flash_nf.c @@ -9,33 +9,36 @@ #include #include +#include #include #include "flash_nf.h" bool done; -int set_nonblocking(int sockfd) +static int set_nonblocking(int sockfd) { int flags = fcntl(sockfd, F_GETFL, 0); if (flags == -1) { - perror("fcntl F_GETFL"); + log_error("fcntl F_GETFL"); return -1; } flags |= O_NONBLOCK; // Add the O_NONBLOCK flag if (fcntl(sockfd, F_SETFL, flags) == -1) { - perror("fcntl F_SETFL"); + log_error("fcntl F_SETFL"); return -1; } return 0; } -void wait_for_cmd(struct config *cfg) +void 
flash__wait(struct config *cfg) { int cmd; - set_nonblocking(cfg->uds_sockfd); + + if (set_nonblocking(cfg->uds_sockfd) < 0) + log_warn("Failed to set UDS socket to non-blocking mode"); while (!done) { int bytes_received = read(cfg->uds_sockfd, &cmd, sizeof(int)); @@ -96,7 +99,7 @@ static int *__configure(struct config *cfg, struct nf *nf) send_cmd(uds_sockfd, FLASH__CREATE_SOCKET); recv_fd(uds_sockfd, received_fd + i); recv_data(uds_sockfd, &cfg->ifqueue[i], sizeof(int)); - log_info("RECEIVED SOCKET-%d FD-%d, binded to Queue-%d", i, received_fd[i], cfg->ifqueue[i]); + log_info("RECEIVED SOCKET-%d FD-%d, bound to Queue-%d", i, received_fd[i], cfg->ifqueue[i]); } send_cmd(uds_sockfd, FLASH__GET_ROUTE_INFO); @@ -261,19 +264,59 @@ void flash__populate_fill_ring(struct thread **thread, int frame_size, int total } } -void flash__xsk_close(struct config *cfg, struct nf *nf) +static int __populate_fill_ring(struct thread *thread, bool full, int umem_scale) { - log_info("Shutting down..."); + int ret, i; + int nr_frames; + uint32_t idx = 0; + uint64_t fill_addr; - close_uds_conn(cfg); + if (full) + nr_frames = (size_t)XSK_RING_PROD__DEFAULT_NUM_DESCS * (size_t)2 * (size_t)umem_scale; + else + nr_frames = (size_t)XSK_RING_PROD__DEFAULT_NUM_DESCS * (size_t)umem_scale; - size_t desc_sz = sizeof(struct xdp_desc); + ret = xsk_ring_prod__reserve(&thread->socket->fill, nr_frames, &idx); + if (ret != nr_frames) { + log_error("errno: %d/\"%s\"", errno, strerror(errno)); + return -1; + } + + for (i = 0; i < nr_frames; i++) { + if (!flash_pool__get(thread->socket->flash_pool, &fill_addr)) { + log_error("ERROR: Unable to get frame from flash pool"); + return -1; + } + + *xsk_ring_prod__fill_addr(&thread->socket->fill, idx++) = fill_addr; + } + + xsk_ring_prod__submit(&thread->socket->fill, nr_frames); + + return 0; +} + +void flash__xsk_close(struct config *cfg, struct nf *nf) +{ struct xdp_mmap_offsets off; + size_t desc_sz = sizeof(struct xdp_desc); int err; - if (!nf) + 
log_debug("Shutting down..."); + + if (!cfg || !nf) return; + + // Corner case handling to be done properly + close_uds_conn(cfg); + for (int i = 0; i < cfg->total_sockets; i++) { + if (!nf->thread[i] && !nf->thread[i]->socket) + return; + + if (nf->thread[i]->socket->flash_pool) + flash_pool__destroy(nf->thread[i]->socket->flash_pool); + err = xsk_get_mmap_offsets(nf->thread[i]->socket->fd, &off); if (!err) { munmap(nf->thread[i]->socket->rx.ring - off.rx.desc, off.rx.desc + cfg->xsk_config->rx_size * desc_sz); @@ -283,20 +326,28 @@ void flash__xsk_close(struct config *cfg, struct nf *nf) munmap(nf->thread[i]->socket->comp.ring - off.cr.desc, off.cr.desc + cfg->umem_config->comp_size * sizeof(uint64_t)); } + free(nf->thread[i]->socket); free(nf->thread[i]); } + free(nf->thread); + free(nf->next); free(nf); - if (cfg->umem->buffer) { - munmap(cfg->umem->buffer, NUM_FRAMES * cfg->umem->frame_size * cfg->total_sockets); - } + if (cfg->umem) { + if (cfg->umem->buffer) + munmap(cfg->umem->buffer, NUM_FRAMES * cfg->umem->frame_size * cfg->total_sockets); - if (cfg && cfg->umem && cfg->xsk) { free(cfg->umem); + } + + if (cfg->xsk) free(cfg->xsk); - free(cfg); + + if (cfg->umem_config && cfg->xsk_config) { + free(cfg->umem_config); + free(cfg->xsk_config); } } @@ -307,45 +358,92 @@ static bool xsk_page_aligned(void *buffer) return !(addr & (getpagesize() - 1)); } -void flash__configure_nf(struct nf **_nf, struct config *cfg) +int flash__configure_nf(struct nf **_nf, struct config *cfg) { - struct nf *nf = (struct nf *)calloc(1, sizeof(struct nf)); - int *sockfd = __configure(cfg, nf); + int i, size; + int *sockfd; + struct nf *nf; - if (cfg->total_sockets <= 0) - log_error("Invalid number of sockets"); - nf->thread = (struct thread **)calloc(cfg->total_sockets, sizeof(struct thread *)); + nf = (struct nf *)calloc(1, sizeof(struct nf)); + if (!nf) { + log_error("ERROR: Memory allocation failed for nf"); + return -1; + } - for (int i = 0; i < cfg->total_sockets; i++) - 
nf->thread[i] = (struct thread *)calloc(1, sizeof(struct thread)); + // monitor communication happens here - overhaul required + // corner case handling etc. + // routing should be handled better + sockfd = __configure(cfg, nf); - int size = cfg->umem->size; + size = cfg->umem->size; cfg->umem->buffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, cfg->umem_fd, 0); if (cfg->umem->buffer == MAP_FAILED) { - log_error("ERROR: (UMEM setup) mmap failed \"%s\"\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: (UMEM setup) mmap failed \"%s\"", strerror(errno)); + goto out_error; } if (!size && !xsk_page_aligned(cfg->umem->buffer)) { - log_error("ERROR: UMEM size is not page aligned \"%s\"\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: UMEM size is not page aligned \"%s\"", strerror(errno)); + goto out_error; } - setup_xsk_config(&cfg->xsk_config, &cfg->umem_config, cfg); + if (cfg->total_sockets <= 0) { + log_error("Invalid number of sockets"); + goto out_error; + } - for (int i = 0; i < cfg->total_sockets; i++) { - log_info("SOCKET FD (Thread %d) :::: %d\n", i, sockfd[i]); + nf->thread = (struct thread **)calloc(cfg->total_sockets, sizeof(struct thread *)); + if (!nf->thread) { + log_error("ERROR: Memory allocation failed for threads"); + goto out_error; } - for (int i = 0; i < cfg->total_sockets; i++) { + setup_xsk_config(&cfg->xsk_config, &cfg->umem_config, cfg); + + for (i = 0; i < cfg->total_sockets; i++) { + log_debug("Thread %d: socket fd ::: %d", i, sockfd[i]); + nf->thread[i] = (struct thread *)calloc(1, sizeof(struct thread)); + if (!nf->thread[i]) { + log_error("ERROR: Memory allocation failed for thread %d", i); + goto out_error; + } + nf->thread[i]->socket = (struct socket *)calloc(1, sizeof(struct socket)); + if (!nf->thread[i]->socket) { + log_error("ERROR: Memory allocation failed for socket %d", i); + goto out_error; + } + + nf->thread[i]->socket->flash_pool = flash_pool__create(cfg->umem->frame_size, 
cfg->umem_offset + i, cfg->umem_scale); + if (!nf->thread[i]->socket->flash_pool) { + log_error("ERROR: (Flash Pool setup) flash_pool__create failed \"%s\"", strerror(errno)); + goto out_error; + } + nf->thread[i]->socket->fd = sockfd[i]; nf->thread[i]->socket->ifqueue = cfg->ifqueue[i]; - if (xsk_mmap_umem_rings(nf->thread[i]->socket, *cfg->umem_config, *cfg->xsk_config) != 0) { - log_error("ERROR: (Ring setup) mmap failed \"%s\"\n", strerror(errno)); - exit(EXIT_FAILURE); + nf->thread[i]->socket->idle_fd.fd = sockfd[i]; + nf->thread[i]->socket->idle_fd.events = POLLIN; + + if (xsk_mmap_umem_rings(nf->thread[i]->socket, *cfg->umem_config, *cfg->xsk_config) < 0) { + log_error("ERROR: (Ring setup) mmap failed \"%s\"", strerror(errno)); + goto out_error; + } + + if (__populate_fill_ring(nf->thread[i], cfg->rx_first, cfg->umem_scale) < 0) { + log_error("ERROR: (Fill ring setup) __populate_fill_ring failed \"%s\"", strerror(errno)); + goto out_error; } } + + // Is this handling correct?? + free(cfg->ifqueue); free(sockfd); *_nf = nf; + return 0; + +out_error: + free(sockfd); + flash__xsk_close(cfg, nf); + return -1; } diff --git a/lib/flash/nf/flash_nf.h b/lib/flash/nf/flash_nf.h index 845b246..ee0f1bb 100644 --- a/lib/flash/nf/flash_nf.h +++ b/lib/flash/nf/flash_nf.h @@ -23,22 +23,124 @@ struct xskmsghdr { uint32_t msg_len; /* Number of vectors */ }; +struct stats_conf { + struct nf *nf; + struct config *cfg; +}; + extern bool done; -void flash__populate_fill_ring(struct thread **thread, int frame_size, int total_sockets, int umem_offset, int umem_scale); -void flash__configure_nf(struct nf **_nf, struct config *cfg); +/* Control Path API */ + +/** + * Configure the NF with the provided configuration. + * Communicates with the monitor to set up the NF. + * + * @param nf: Pointer to the nf structure pointer to be configured. + * @param cfg: Pointer to the configuration structure. 
+ * + * This function sets up the NF with the provided configuration, including + * memory mapping, socket setup, and thread initialization. + * Therefore nf is a pointer to a pointer, where as cfg is a pointer to the configuration structure. + * + * @return 0 on success, or -1 on failure. + */ +int flash__configure_nf(struct nf **_nf, struct config *cfg); + +/** + * Wait for a signal from the server to indicate that server wants to close the nf. + * This function sets the UDS socket to non-blocking mode and checks + * for incoming signals until it receives one or the connection is closed. + * + * @param cfg Pointer to the configuration structure. + */ +void flash__wait(struct config *cfg); + +/** + * Close the NF and clean up resources. + * This function unmaps memory regions, frees allocated structures, + * and closes file descriptors associated with the NF. + * + * @param cfg: Pointer to the configuration structure. + * @param nf: Pointer to the nf structure to be closed. + */ void flash__xsk_close(struct config *cfg, struct nf *nf); -int flash__poll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout); -size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr *msg); -size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t nsend); -size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop); + +/* Data Path API */ + +/** + * Poll the NF for incoming packets. + * + * @param cfg: Pointer to the configuration structure. + * @param xsk: Pointer to the socket structure. + * @param fds: Pointer to the array of pollfd structures. + * @param nfds: Number of file descriptors to poll. + * + * @return Number of file descriptors that are ready for reading, or -2 if polling is not enabled. + * 0 if timeout occurs or no file descriptors are ready. -1 on error. 
+ */ +int flash__poll(struct config *cfg, struct socket *xsk, struct pollfd *fds, nfds_t nfds); + +/** + * Receive messages from the socket. + * + * @param cfg: Pointer to the configuration structure. + * @param xsk: Pointer to the socket structure. + * @param xskvecs: Pointer to the array of xskvec structures to receive data into. + * @param nrecv: Number of messages to receive. + * + * @return Number of messages received, or 0 if no messages are available. + */ +size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nrecv); + +/** + * Send messages through the socket. + * + * @param cfg: Pointer to the configuration structure. + * @param xsk: Pointer to the socket structure. + * @param xskvecs: Pointer to the array of xskvec structures containing data to send. + * @param nsend: Number of messages to send. + * + * @return Number of messages sent, or 0 if no messages were sent. + */ +size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nsend); + +int flash__oldpoll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout); +size_t flash__oldrecvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr *msg); +size_t flash__oldsendmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t nsend); +size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop); + +/** + * Thread function to periodically dump statistics of the NF. + * + * @param conf: Pointer to the stats_conf structure containing NF and config. + * + * This routine should be invoked via threads, and it will periodically clear the terminal + * and dump statistics for each socket in the NF. + */ +void *flash__stats_thread(void *conf); + +/* Advanced API */ + +void flash__populate_fill_ring(struct thread **thread, int frame_size, int total_sockets, int umem_offset, int umem_scale); + +/** + * Get the current time in nanoseconds. 
+ * @param cfg: Pointer to the configuration structure. + * + * Returns the current time in nanoseconds since the epoch. + */ unsigned long flash__get_nsecs(struct config *cfg); + +/** + * Dump statistics for the given socket. + * @param cfg: Pointer to the configuration structure. + * @param xsk: Pointer to the socket structure. + */ void flash__dump_stats(struct config *cfg, struct socket *xsk); -void wait_for_cmd(struct config *cfg); -int set_nonblocking(int sockfd); /* Experimental */ size_t flash__sendmsg_us(struct config *cfg, struct socket *xsk, struct socket *xsk_first, struct xskvec **msgiov, uint32_t nsend); size_t flash__recvmsg_us(struct config *cfg, struct socket *xsk, struct socket *xsk_first, struct xskmsghdr *msg); size_t flash__dropmsg_us(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop); -#endif /* __FLASH_NF_H */ \ No newline at end of file +#endif /* __FLASH_NF_H */ diff --git a/lib/flash/nf/flash_stats.c b/lib/flash/nf/flash_stats.c index a836083..8ead395 100644 --- a/lib/flash/nf/flash_stats.c +++ b/lib/flash/nf/flash_stats.c @@ -4,6 +4,8 @@ #include #include +#include +#include #include "flash_nf.h" @@ -220,3 +222,28 @@ void flash__dump_stats(struct config *cfg, struct socket *xsk) __dump_driver_stats(cfg, xsk, diff); } } + +void *flash__stats_thread(void *conf) +{ + struct stats_conf *arg = (struct stats_conf *)conf; + struct nf *nf = arg->nf; + struct config *cfg = arg->cfg; + + if (cfg->verbose) { + unsigned int interval = cfg->stats_interval; + setlocale(LC_ALL, ""); + + for (int i = 0; i < cfg->total_sockets; i++) + nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); + + while (!done) { + sleep(interval); + if (system("clear") != 0) + log_error("Terminal clear error"); + for (int i = 0; i < cfg->total_sockets; i++) { + flash__dump_stats(cfg, nf->thread[i]->socket); + } + } + } + return NULL; +} diff --git a/lib/flash/nf/flash_txrx.c b/lib/flash/nf/flash_txrx.c index 7ca7d85..fb4bcd7 100644 --- 
a/lib/flash/nf/flash_txrx.c +++ b/lib/flash/nf/flash_txrx.c @@ -6,6 +6,8 @@ #include #include +#include + #include "flash_nf.h" static uint64_t __hz; @@ -88,7 +90,7 @@ static uint64_t get_timer_hz(struct config *cfg) return __hz; } -int flash__poll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout) +int flash__oldpoll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout) { #ifdef STATS xsk->app_stats.opt_polls++; @@ -96,6 +98,17 @@ int flash__poll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout return poll(fds, nfds, timeout); } +int flash__poll(struct config *cfg, struct socket *xsk, struct pollfd *fds, nfds_t nfds) +{ + if (!(cfg->xsk->mode & FLASH__POLL)) + return -2; + +#ifdef STATS + xsk->app_stats.opt_polls++; +#endif + return poll(fds, nfds, cfg->xsk->poll_timeout); +} + static void __kick_tx(struct socket *xsk) { int ret; @@ -155,6 +168,62 @@ static inline void __complete_tx_rx_first(struct config *cfg, struct socket *xsk } } +static inline void __complete_tx_completions(struct config *cfg, struct socket *xsk) +{ + uint32_t idx_cq = 0, idx_fq = 0; + uint32_t completed, num_outstanding, i, ret; + uint64_t addr; + + if (!xsk->outstanding_tx) + return; + + /** + * In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to + * really send the packets. In zero-copy mode we do not have to do this, since Tx + * is driven by the NAPI loop. So as an optimization, we do not have to call + * sendto() all the time in zero-copy mode. + */ + if (cfg->xsk->bind_flags & XDP_COPY) { +#ifdef STATS + xsk->app_stats.copy_tx_sendtos++; +#endif + __kick_tx(xsk); + } + + num_outstanding = xsk->outstanding_tx > cfg->xsk->batch_size ? 
cfg->xsk->batch_size : xsk->outstanding_tx; + + /* Re-add completed TX buffers */ + completed = xsk_ring_cons__peek(&xsk->comp, num_outstanding, &idx_cq); + if (!completed) + return; + + if (cfg->rx_first) { + ret = xsk_ring_prod__reserve(&xsk->fill, completed, &idx_fq); + while (ret != completed) { + if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->fill)) { +#ifdef STATS + xsk->app_stats.fill_fail_polls++; +#endif + recvfrom(xsk->fd, NULL, 0, MSG_DONTWAIT, NULL, NULL); + } + ret = xsk_ring_prod__reserve(&xsk->fill, completed, &idx_fq); + } + + for (i = 0; i < completed; i++) + *xsk_ring_prod__fill_addr(&xsk->fill, idx_fq++) = *xsk_ring_cons__comp_addr(&xsk->comp, idx_cq++); + + xsk_ring_prod__submit(&xsk->fill, completed); + } else { + for (i = 0; i < completed; i++) { + addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx_cq++); + flash_pool__put(xsk->flash_pool, addr); + } + } + + xsk_ring_cons__release(&xsk->comp, completed); + xsk->outstanding_tx -= completed; +} + static inline uint32_t __reserve_fq(struct config *cfg, struct socket *xsk, uint32_t num) { uint32_t idx_fq = 0; @@ -173,7 +242,7 @@ static inline uint32_t __reserve_fq(struct config *cfg, struct socket *xsk, uint return idx_fq; } -static inline uint32_t __reserve_tx(struct config *cfg, struct socket *xsk, uint32_t num) +static inline uint32_t __old_reserve_tx(struct config *cfg, struct socket *xsk, uint32_t num) { uint32_t idx_tx = 0; uint32_t ret; @@ -199,6 +268,32 @@ static inline uint32_t __reserve_tx(struct config *cfg, struct socket *xsk, uint return idx_tx; } +static inline uint32_t __reserve_tx(struct config *cfg, struct socket *xsk, uint32_t num) +{ + uint32_t idx_tx = 0; + uint32_t ret; + + ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); + while (ret != num) { + __complete_tx_completions(cfg, xsk); + if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->tx)) { +#ifdef STATS + xsk->app_stats.tx_wakeup_sendtos++; +#endif + __kick_tx(xsk); + } 
+ ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); + + if (cfg->smart_poll && ret != num && xsk->outstanding_tx >= cfg->xsk->bp_thres) { + usleep(cfg->xsk->bp_timeout); +#ifdef STATS + xsk->app_stats.backpressure++; +#endif + } + } + return idx_tx; +} + static void __hex_dump(void *pkt, size_t length, uint64_t addr) { const unsigned char *address = (unsigned char *)pkt; @@ -234,7 +329,7 @@ static void __hex_dump(void *pkt, size_t length, uint64_t addr) printf("\n"); } -size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr *msg) +size_t flash__oldrecvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr *msg) { int ret; uint32_t idx_rx = 0; @@ -247,7 +342,7 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr * __complete_tx_rx_first(cfg, xsk); if (cfg->smart_poll && cfg->xsk->idle_timeout && xsk->idle_timestamp && rdtsc() > xsk->idle_timestamp) { - ret = flash__poll(xsk, &xsk->idle_fd, 1, cfg->xsk->poll_timeout); + ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, cfg->xsk->poll_timeout); if (ret <= 0) { xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); return 0; @@ -305,7 +400,110 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr * return rcvd; } -size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t nsend) +static inline void __replenish_fill_ring(struct config *cfg, struct socket *xsk, uint32_t num) +{ + uint32_t ret, idx_fq = 0; + uint64_t addr = 0; + + ret = xsk_ring_prod__reserve(&xsk->fill, num, &idx_fq); + while (ret != num) { + if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->fill)) { +#ifdef STATS + xsk->app_stats.fill_fail_polls++; +#endif + recvfrom(xsk->fd, NULL, 0, MSG_DONTWAIT, NULL, NULL); + } + ret = xsk_ring_prod__reserve(&xsk->fill, num, &idx_fq); + } + + for (uint32_t i = 0; i < num; i++) { + while (!flash_pool__get(xsk->flash_pool, &addr)) { + 
__complete_tx_completions(cfg, xsk); + if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->tx)) { +#ifdef STATS + xsk->app_stats.tx_wakeup_sendtos++; +#endif + __kick_tx(xsk); + } + } + *xsk_ring_prod__fill_addr(&xsk->fill, idx_fq++) = addr; + } + + xsk_ring_prod__submit(&xsk->fill, num); +} + +size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nrecv) +{ + int ret; + uint64_t *pkt; + uint64_t addr, orig; + const struct xdp_desc *desc; + uint32_t rcvd, nb, i, len, eop_cnt = 0, idx_rx = 0; + + /* Ensures that rx can happen during tx pressure */ + __complete_tx_completions(cfg, xsk); + + if (cfg->smart_poll && cfg->xsk->idle_timeout && xsk->idle_timestamp && rdtsc() > xsk->idle_timestamp) { + ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, cfg->xsk->poll_timeout); + if (ret <= 0) { + xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); + return 0; + } + } + + nb = nrecv > cfg->xsk->batch_size ? 
cfg->xsk->batch_size : nrecv; + + rcvd = xsk_ring_cons__peek(&xsk->rx, nb, &idx_rx); + if (!rcvd) { + if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->fill)) { +#ifdef STATS + xsk->app_stats.rx_empty_polls++; +#endif + recvfrom(xsk->fd, NULL, 0, MSG_DONTWAIT, NULL, NULL); + } + + if (cfg->smart_poll && cfg->xsk->idle_timeout && !xsk->idle_timestamp) + xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); + + return 0; + } + + if (cfg->smart_poll && rcvd >= cfg->xsk->idle_thres) + xsk->idle_timestamp = 0; + + if (rcvd > cfg->xsk->batch_size) + log_warn("errno: %d/\"%s\"", errno, strerror(errno)); + + for (i = 0; i < rcvd; i++) { + desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); + eop_cnt += IS_EOP_DESC(desc->options); + addr = desc->addr; + len = desc->len; + orig = addr; + + addr = xsk_umem__add_offset_to_addr(addr); + pkt = xsk_umem__get_data(cfg->umem->buffer, addr); + + xskvecs[i].data = pkt; + xskvecs[i].len = len; + xskvecs[i].addr = orig; + xskvecs[i].options = desc->options; + + __hex_dump(pkt, len, addr); + } + + if (!cfg->rx_first) + __replenish_fill_ring(cfg, xsk, rcvd); + + xsk_ring_cons__release(&xsk->rx, rcvd); +#ifdef STATS + xsk->ring_stats.rx_npkts += eop_cnt; + xsk->ring_stats.rx_frags += rcvd; +#endif + return rcvd; +} + +size_t flash__oldsendmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t nsend) { uint32_t i; uint32_t frags_done = 0, eop_cnt = 0; @@ -348,7 +546,54 @@ size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec **ms return nsend; } -size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop) +size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nsend) +{ + bool eop; + uint64_t addr; + struct xskvec *xv; + struct xdp_desc *tx_desc; + uint32_t i, idx_tx, len, frags_done = 0, eop_cnt = 0, nb_frags = 0; + + if (!nsend) + return 0; + + idx_tx = 
__reserve_tx(cfg, xsk, nsend); + + for (i = 0; i < nsend; i++) { + xv = &xskvecs[i]; + eop = IS_EOP_DESC(xv->options); + addr = xv->addr; + len = xv->len; + nb_frags++; + + tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++); + + tx_desc->options = eop ? 0 : XDP_PKT_CONTD; + tx_desc->options |= (xv->options & 0xFFFF0000); + tx_desc->addr = addr; + tx_desc->len = len; + + __hex_dump(xv->data, len, addr); + + if (eop) { + frags_done += nb_frags; + nb_frags = 0; + eop_cnt++; + } + } + xsk_ring_prod__submit(&xsk->tx, frags_done); + xsk->outstanding_tx += frags_done; + + if (!cfg->rx_first) + __complete_tx_completions(cfg, xsk); +#ifdef STATS + xsk->ring_stats.tx_npkts += eop_cnt; + xsk->ring_stats.tx_frags += nsend; +#endif + return nsend; +} + +size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop) { uint32_t i; uint32_t eop_cnt = 0; diff --git a/lib/flash/nf/meson.build b/lib/flash/nf/meson.build index b871c01..684af69 100644 --- a/lib/flash/nf/meson.build +++ b/lib/flash/nf/meson.build @@ -4,7 +4,7 @@ sources = files('flash_nf.c', 'flash_stats.c', 'flash_txrx.c', 'flash_txrx_dev.c') headers = files('flash_nf.h') -deps += [uds, common] +deps += [uds, common, pool] libnf = library(libname, sources, install: true, dependencies: deps) nf = declare_dependency(link_with: libnf, include_directories: include_directories('.')) diff --git a/lib/flash/params/flash_params.c b/lib/flash/params/flash_params.c index 8649dc7..0883a4c 100644 --- a/lib/flash/params/flash_params.c +++ b/lib/flash/params/flash_params.c @@ -11,38 +11,41 @@ #define BUFSIZE 30 -const char *__doc__ = "FLASH AF_XDP NF Library\n"; - const struct option_wrapper long_options[] = { - { { "umem-id", required_argument, NULL, 'u' }, "Umem id to connect to monitor" }, + { { "help", no_argument, NULL, 'h' }, "Show help", false }, - { { "nf-id", required_argument, NULL, 'f' }, "NF id to connect to monitor" }, + { { "umem-id", required_argument, NULL, 'u' }, "umem id 
to connect to monitor", "", true }, - { { "app-stats", no_argument, NULL, 'a' }, "Display application (syscall) statistics. (default: disabled)" }, + { { "nf-id", required_argument, NULL, 'f' }, "nf id to connect to monitor", "", true }, - { { "extra-stats", no_argument, NULL, 'x' }, "Display extra (xdp) statistics. (default: disabled)" }, + { { "tx-first", no_argument, NULL, 't' }, "TX without receiving any packets [default: disabled]" }, - { { "interval", required_argument, NULL, 'n' }, "Specify statistics update interval (default: 1 sec)." }, + { { "app-stats", no_argument, NULL, 'a' }, "Display application (syscall) statistics [default: disabled]" }, - { { "quiet", no_argument, NULL, 'Q' }, "Quiet mode (no output) (default: disabled)" }, + { { "extra-stats", no_argument, NULL, 'x' }, "Display extra (xdp) statistics [default: disabled]" }, - { { "smart-poll", no_argument, NULL, 'p' }, "Smart polling mode (default: disabled)" }, + { { "interval", required_argument, NULL, 'n' }, "Specify statistics update interval [default: 1 sec]", "" }, - { { "idle-timeout", required_argument, NULL, 'i' }, "Idle timeout for smart polling mode in ms. 
(default: 100)" }, + { { "quiet", no_argument, NULL, 'Q' }, "Quiet mode (no output) [default: disabled]" }, - { { "idleness", required_argument, NULL, 'I' }, "Idleness for smart polling, busy-polling (0) to poll (1) (default: 0)" }, + { { "smart-poll", no_argument, NULL, 'p' }, "Smart polling mode [default: disabled]" }, - { { "bp-timeout", required_argument, NULL, 'b' }, "Sleep duration on backpressure in us (default: 1000)" }, + { { "idle-timeout", required_argument, NULL, 'i' }, "Idle timeout for smart polling mode in ms [default: 100]", "" }, - { { "bp-sense", required_argument, NULL, 'B' }, - "Sensitivity for detecting backpressure, 0: 0 pkts - 1: 2048 pkts (default: 0.5)" }, + { { "idleness", required_argument, NULL, 'I' }, + "Idleness for smart polling, busy-polling (0) to poll (1) [default: 0]", + "" }, - { { "frags", no_argument, NULL, 'F' }, "Enable frags (multi-buffer) support. -- not implemented yet" }, + { { "timeout", required_argument, NULL, 'b' }, "Sleep duration on backpressure in us [default: 1000]", "" }, - { { "clock", required_argument, NULL, 'w' }, "Clock NAME (default MONOTONIC). 
-- not implemented yet" }, + { { "bp-sense", required_argument, NULL, 'B' }, + "Sensitivity for detecting backpressure, 0: 0 pkts - 1: 2048 pkts [default: 0.5]", + "" }, - { { "help", no_argument, NULL, 'h' }, "Show help", false }, + { { "frags", no_argument, NULL, 'F' }, "Enable frags (multi-buffer) support -- not implemented yet", false }, + + { { "clock", required_argument, NULL, 'w' }, "Clock NAME (default MONOTONIC) -- not implemented yet", "", false }, { { 0, 0, NULL, 0 }, NULL, false } }; @@ -102,25 +105,38 @@ static void _print_options(const struct option_wrapper *long_options, bool requi } } -static void usage(const char *prog_name, const char *doc, const struct option_wrapper *long_options, bool full) +static void usage(const char *prog_name, const struct option_wrapper *long_options, bool full, struct config *cfg) { - printf("Usage: %s [options]\n", prog_name); + printf("\nUsage: %s [options] -- [app-options]\n", prog_name); if (!full) { printf("Use --help (or -h) to see full option list.\n"); return; } - printf("\nDOCUMENTATION:\n %s\n", doc); + printf("\n"); + if (cfg->app_name) + printf("%s using FLASH AF_XDP Library\n\n", cfg->app_name); + else + printf("FLASH AF_XDP Library\n\n"); printf("Required options:\n"); _print_options(long_options, true); printf("\n"); printf("Other options:\n"); _print_options(long_options, false); printf("\n"); + if (cfg->app_options) { + printf("Application options:\n"); + for (int i = 0; cfg->app_options[i]; i++) { + printf(" %s\n", cfg->app_options[i]); + } + printf("\n"); + } + + printf("For more help on how to use FLASH, head to https://github.com/networkedsystemsIITB/flash\n\n"); } -static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper *options_wrapper, struct config *cfg, const char *doc) +static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper *options_wrapper, struct config *cfg) { int opt, ret; int longindex = 0; @@ -134,12 +150,12 @@ static int 
parse_cmdline_args(int argc, char **argv, const struct option_wrapper optind = 1; if (option_wrappers_to_options(options_wrapper, &long_options)) { - log_error("ERROR: (Parsing error) Unable to malloc()\n"); - exit(EXIT_FAILURE); + log_error("ERROR: (Parsing error) Unable to malloc()"); + return -1; } /* Parse commands line args */ - while ((opt = getopt_long(argc, argv, "u:f:axn:Qpi:I:b:B:Fw:h", long_options, &longindex)) != -1) { + while ((opt = getopt_long(argc, argv, "u:f:taxn:Qpi:I:b:B:Fw:h", long_options, &longindex)) != -1) { switch (opt) { case 'u': cfg->umem_id = atoi(optarg); @@ -147,6 +163,9 @@ static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper case 'f': cfg->nf_id = atoi(optarg); break; + case 't': + cfg->rx_first = false; + break; case 'a': cfg->app_stats = true; break; @@ -179,19 +198,26 @@ static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper break; case 'w': if (get_clockid(&cfg->clock, optarg)) - log_error("ERROR: Invalid clock %s. Default to CLOCK_MONOTONIC.\n", optarg); + log_warn("ERROR: Invalid clock %s. 
Default to CLOCK_MONOTONIC.", optarg); break; case 'h': full_help = true; /* fall-through */ default: - usage(argv[0], doc, options_wrapper, full_help); + usage(argv[0], options_wrapper, full_help, cfg); free(long_options); - exit(EXIT_FAILURE); + return -1; } } free(long_options); + /* Check for required options */ + if (cfg->umem_id < 0 || cfg->nf_id < 0) { + log_fatal("ERROR: (Parsing error) Required options missing: --umem-id and --nf-id"); + usage(argv[0], options_wrapper, (argc == 1), cfg); + return -1; + } + if (optind >= 0) argv[optind - 1] = argv[0]; @@ -206,13 +232,18 @@ static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper int flash__parse_cmdline_args(int argc, char **argv, struct config *cfg) { + int ret; + cfg->umem = calloc(1, sizeof(struct umem_config)); cfg->xsk = calloc(1, sizeof(struct xsk_config)); if (!cfg->xsk || !cfg->umem) { - log_error("ERROR: Memory allocation failed\n"); - exit(EXIT_FAILURE); + log_error("ERROR: Memory allocation failed"); + return -1; } + cfg->umem_id = -1; + cfg->nf_id = -1; + cfg->rx_first = true; cfg->xsk->batch_size = BATCH_SIZE; cfg->umem->frame_size = FRAME_SIZE; cfg->stats_interval = 1; @@ -226,12 +257,19 @@ int flash__parse_cmdline_args(int argc, char **argv, struct config *cfg) cfg->xsk->bp_timeout = 1000; cfg->xsk->bp_thres = (__u32)(XSK_RING_PROD__DEFAULT_NUM_DESCS * 0.5); - int ret = parse_cmdline_args(argc, argv, long_options, cfg, __doc__); + ret = parse_cmdline_args(argc, argv, long_options, cfg); + if (ret < 0) + goto cleanup; if ((cfg->umem->frame_size & (cfg->umem->frame_size - 1))) { - log_error("ERROR: (Parsing error) --frame-size=%d is not a power of two\n", cfg->umem->frame_size); - exit(EXIT_FAILURE); + log_error("ERROR: (Parsing error) --frame-size=%d is not a power of two", cfg->umem->frame_size); + goto cleanup; } return ret; + +cleanup: + free(cfg->umem); + free(cfg->xsk); + return -1; } diff --git a/lib/flash/params/flash_params.h b/lib/flash/params/flash_params.h 
index 6c221bf..9ecfe6e 100644 --- a/lib/flash/params/flash_params.h +++ b/lib/flash/params/flash_params.h @@ -15,7 +15,16 @@ struct option_wrapper { bool required; }; +/** + * Parse command line arguments for the flash application. + * Allocates memory for umem and xsk configurations. + * Sets default values for various parameters. + * + * @param argc Number of command line arguments + * @param argv Array of command line arguments + * @param cfg Pointer to the configuration structure to be filled + * @return shift on success, -1 on failure; shift can be used to skip the parsed options + */ int flash__parse_cmdline_args(int argc, char **argv, struct config *cfg); -int get_irqs(struct config *cfg); #endif diff --git a/lib/flash/pool/flash_pool.c b/lib/flash/pool/flash_pool.c new file mode 100644 index 0000000..e87997a --- /dev/null +++ b/lib/flash/pool/flash_pool.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025 Debojeet Das + */ +#include + +#include + +#include "flash_pool.h" + +struct flash_pool *flash_pool__create(int frame_size, int umem_th_offset, int umem_scale) +{ + if (umem_th_offset < 0 || umem_scale <= 0) { + log_error("Invalid parameters for flash_pool__create"); + return NULL; + } + + uint32_t nr_frames = (size_t)XSK_RING_PROD__DEFAULT_NUM_DESCS * (size_t)2 * (size_t)umem_scale; + + struct flash_pool *pool = (struct flash_pool *)malloc(sizeof(struct flash_pool) + nr_frames * sizeof(uint64_t)); + if (!pool) { + log_error("Memory allocation failed for flash_pool"); + return NULL; + } + + pool->head = 0; + pool->tail = 0; + pool->size = nr_frames; + + for (uint32_t i = umem_th_offset * nr_frames; i < nr_frames * (umem_th_offset + 1); i++) + pool->desc[pool->tail++] = i * frame_size; + + return pool; +} + +void flash_pool__destroy(struct flash_pool *pool) +{ + if (pool) + free(pool); +} diff --git a/lib/flash/pool/flash_pool.h b/lib/flash/pool/flash_pool.h new file mode 100644 index 0000000..fc67677 --- /dev/null +++ 
b/lib/flash/pool/flash_pool.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025 Debojeet Das + */ + +#ifndef __FLASH_POOL_H +#define __FLASH_POOL_H + +#include + +struct flash_pool { + volatile uint32_t head; + volatile uint32_t tail; + volatile uint32_t size; + volatile uint64_t desc[]; +}; + +static inline bool flash_pool__get(struct flash_pool *pool, uint64_t *desc) +{ + if (!pool || pool->head == pool->tail) + return false; + + *desc = pool->desc[pool->head++ & (pool->size - 1)]; + return true; +} + +static inline bool flash_pool__put(struct flash_pool *pool, uint64_t desc) +{ + if (!pool || pool->tail - pool->head >= pool->size) + return false; + + pool->desc[pool->tail++ & (pool->size - 1)] = desc; + return true; +} + +struct flash_pool *flash_pool__create(int frame_size, int umem_th_offset, int umem_scale); +void flash_pool__destroy(struct flash_pool *pool); + +#endif /* __FLASH_POOL_H */ \ No newline at end of file diff --git a/lib/flash/pool/meson.build b/lib/flash/pool/meson.build new file mode 100644 index 0000000..3a158f2 --- /dev/null +++ b/lib/flash/pool/meson.build @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2025 Debojeet Das + +sources = files('flash_pool.c') +headers = files('flash_pool.h') + +deps += [] + +libpool = library(libname, sources, install: true, dependencies: deps) +pool = declare_dependency(link_with: libpool, include_directories: include_directories('.')) + +flash_libs += pool \ No newline at end of file diff --git a/lib/include/flash_defines.h b/lib/include/flash_defines.h index 464c6c3..6c5b70f 100644 --- a/lib/include/flash_defines.h +++ b/lib/include/flash_defines.h @@ -50,6 +50,8 @@ struct umem_config { }; struct config { + const char *app_name; + const char * const *app_options; int umem_fd; int uds_sockfd; int umem_scale; @@ -67,6 +69,7 @@ struct config { int nf_id; int umem_offset; bool frags_enabled; + bool rx_first; #ifdef STATS clockid_t clock; int verbose; @@ 
-169,6 +172,7 @@ struct socket { struct xsk_ring_cons comp; struct pollfd idle_fd; bool idle; + void *flash_pool; uint32_t outstanding_tx; uint64_t idle_timestamp; diff --git a/meson.build b/meson.build index 3ff18fc..c1d278c 100644 --- a/meson.build +++ b/meson.build @@ -4,7 +4,7 @@ project( 'flash', 'C', - version: '0.1', + version: '25.5.0', license: 'BSD', default_options: [ 'buildtype=release', From 4328b292acea3ef72c793c1e56f1cc37e2c5d0b0 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Thu, 12 Jun 2025 23:47:37 +0530 Subject: [PATCH 02/43] fix: updated helloworld nf --- examples/helloworld/main.c | 81 ++++++++++++++++++++++++++++++-------- meson.build | 2 +- 2 files changed, 66 insertions(+), 17 deletions(-) diff --git a/examples/helloworld/main.c b/examples/helloworld/main.c index fcbdbab..0e75fa1 100644 --- a/examples/helloworld/main.c +++ b/examples/helloworld/main.c @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright (c) 2025 Debojeet Das * - * helloworld: A simple helloworld NF that shows control plane setup using Flash monitor + * helloworld: A simple helloworld NF that shows how to parse args for application + * and control plane setup using Flash monitor */ #include @@ -12,40 +13,88 @@ #include #include -bool done = false; struct config *cfg = NULL; -struct nf *nf; +struct nf *nf = NULL; -static void int_exit(int sig) +// clang-format off +static const char *hw_options[] = { + "-n \tprint count", + "-u \t\tprint hello universe", + NULL +}; +// clang-format on + +struct appconf { + int count; + bool universe; +} app_conf; + +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { - log_info("Received Signal: %d", sig); - done = true; + int c; + opterr = 0; + + app_conf->count = 1; + app_conf->universe = false; + + argc -= shift; + argv += shift; + + while ((c = getopt(argc, argv, "hn:u")) != -1) + switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; + case 'n': + 
app_conf->count = atoi(optarg); + break; + case 'u': + app_conf->universe = true; + break; + default: + printf("Usage: %s -h\n", argv[-shift]); + return -1; + } + + return 0; } int main(int argc, char **argv) { + int shift; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { - log_error("ERROR: Memory allocation failed\n"); + log_error("ERROR: Memory allocation failed"); exit(EXIT_FAILURE); } - flash__parse_cmdline_args(argc, argv, cfg); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "Hello World Application"; + cfg->app_options = hw_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; - log_info("Control Plane Setup Done"); + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - signal(SIGABRT, int_exit); + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; - log_info("All Setup Done!"); - log_info("Hello, World!"); + log_info("Control Plane setup done..."); + + const char *message = app_conf.universe ? "Hello Universe!" 
: "Hello World!"; + for (int i = 0; i < app_conf.count; i++) + log_info("%s", message); flash__xsk_close(cfg, nf); log_info("Control plane setup is working"); return EXIT_SUCCESS; + +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } diff --git a/meson.build b/meson.build index c1d278c..f927ae0 100644 --- a/meson.build +++ b/meson.build @@ -4,7 +4,7 @@ project( 'flash', 'C', - version: '25.5.0', + version: '25.5.1', license: 'BSD', default_options: [ 'buildtype=release', From 6a888996663263b8f75df9bea8c800a4a296d625 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Fri, 13 Jun 2025 00:30:52 +0530 Subject: [PATCH 03/43] fix: updated simplefwd and other nfs --- examples/helloworld/main.c | 15 +-- examples/l2fwd/main.c | 7 +- examples/simplefwd/main.c | 197 +++++++++++++++++++------------------ meson.build | 2 +- 4 files changed, 111 insertions(+), 110 deletions(-) diff --git a/examples/helloworld/main.c b/examples/helloworld/main.c index 0e75fa1..c666df9 100644 --- a/examples/helloworld/main.c +++ b/examples/helloworld/main.c @@ -4,11 +4,6 @@ * helloworld: A simple helloworld NF that shows how to parse args for application * and control plane setup using Flash monitor */ - -#include -#include -#include - #include #include #include @@ -16,6 +11,11 @@ struct config *cfg = NULL; struct nf *nf = NULL; +struct appconf { + int count; + bool universe; +} app_conf; + // clang-format off static const char *hw_options[] = { "-n \tprint count", @@ -24,11 +24,6 @@ static const char *hw_options[] = { }; // clang-format on -struct appconf { - int count; - bool universe; -} app_conf; - static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c index 69de38e..c3a86b4 100644 --- a/examples/l2fwd/main.c +++ b/examples/l2fwd/main.c @@ -4,12 +4,9 @@ * l2fwd: A simple NF that forwards packets between two interfaces * after swapping or modifying MAC addresses. 
*/ - #include #include #include -#include -#include #include #include @@ -136,8 +133,8 @@ static void swap_mac_addresses(void *data) struct sock_args { int socket_id; - int *next; - int next_size; + // int *next; + // int next_size; }; static void *socket_routine(void *arg) diff --git a/examples/simplefwd/main.c b/examples/simplefwd/main.c index 30799e7..8f2d779 100644 --- a/examples/simplefwd/main.c +++ b/examples/simplefwd/main.c @@ -3,12 +3,8 @@ * * simplefwd: A simple NF that forwards packets without modification */ - #include #include -#include -#include -#include #include #include @@ -16,7 +12,7 @@ bool done = false; struct config *cfg = NULL; -struct nf *nf; +struct nf *nf = NULL; static void int_exit(int sig) { @@ -30,12 +26,20 @@ struct appconf { int stats_cpu; } app_conf; -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +// clang-format off +static const char *l2fwd_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + NULL +}; +// clang-format on + +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; - // Default values app_conf->cpu_start = 0; app_conf->cpu_end = 0; app_conf->stats_cpu = 1; @@ -43,8 +47,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "c:e:s:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -55,164 +62,166 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->stats_cpu = atoi(optarg); break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + + return 0; } -static void do_noting(void *data) +static void do_nothing(void *data) { /* This is stupid but it makes sure that compiler 
doesn't through any errors */ (void)data; } -struct Args { +struct sock_args { int socket_id; - int *next; - int next_size; }; static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - int socket_id = a->socket_id; - log_info("SOCKET_ID: %d", socket_id); - static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; + int ret; + nfds_t nfds = 1; + struct socket *xsk; + struct xskvec *xskvecs; struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + uint32_t i, nrecv, nsend, nb_frags = 0; + struct sock_args *a = (struct sock_args *)arg; - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + log_debug("Socket ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; - fds[0].fd = nf->thread[socket_id]->socket->fd; + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("Failed to allocate send array"); + return NULL; + } + + fds[0].fd = xsk->fd; fds[0].events = POLLIN; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; - bool eop = IS_EOP_DESC(xv->options); - - char *pkt = xv->data; + char *pkt = xskvecs[i].data; if (!nb_frags++) - do_noting(pkt); + do_nothing(pkt); - send[tot_pkt_send++] = &msg.msg_iov[i]; - if (eop) + if (IS_EOP_DESC(xskvecs[i].options)) nb_frags = 0; } if (nrecv) { - ret = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - if (ret != nrecv) { - log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + nsend = flash__sendmsg(cfg, xsk, xskvecs, 
nrecv); + if (nsend != nrecv) { + log_error("errno: %d/\"%s\"", errno, strerror(errno)); + break; } } if (done) break; } - free(msg.msg_iov); - return NULL; -} - -static void *worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } - } - } + free(xskvecs); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { - log_error("ERROR: Memory allocation failed\n"); + log_error("ERROR: Memory allocation failed"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "L2 Forwarding Application"; + cfg->app_options = l2fwd_options; - log_info("Control Plane Setup Done"); + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; + + log_info("Control Plane setup done..."); signal(SIGINT, int_exit); signal(SIGTERM, int_exit); signal(SIGABRT, int_exit); - log_info("STARTING Data Path"); - - for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; + log_info("Starting Data Path..."); - 
log_info("2_NEXT_SIZE: %d", args->next_size); + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg; + } - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + for (int i = 0; i < cfg->total_sockets; i++) { + args[i].socket_id = i; - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } + CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { - log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { - log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; } - pthread_detach(stats_thread); - flash__wait(cfg); + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + 
flash__wait(cfg); flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + free(args); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } diff --git a/meson.build b/meson.build index f927ae0..e511e7c 100644 --- a/meson.build +++ b/meson.build @@ -4,7 +4,7 @@ project( 'flash', 'C', - version: '25.5.1', + version: '25.5-beta', license: 'BSD', default_options: [ 'buildtype=release', From 12f47a628645dd012bf16d0b19e58662e25043ef Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Fri, 13 Jun 2025 01:24:16 +0530 Subject: [PATCH 04/43] style: typo fix and formatting --- examples/l2fwd/main.c | 2 +- examples/simplefwd/main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c index c3a86b4..ec7536d 100644 --- a/examples/l2fwd/main.c +++ b/examples/l2fwd/main.c @@ -152,7 +152,7 @@ static void *socket_routine(void *arg) xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); if (!xskvecs) { - log_error("Failed to allocate send array"); + log_error("Failed to allocate xskvecs array"); return NULL; } diff --git a/examples/simplefwd/main.c b/examples/simplefwd/main.c index 8f2d779..484eb15 100644 --- a/examples/simplefwd/main.c +++ b/examples/simplefwd/main.c @@ -94,7 +94,7 @@ static void *socket_routine(void *arg) xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); if (!xskvecs) { - log_error("Failed to allocate send array"); + log_error("Failed to allocate xskvecs array"); return NULL; } From 31cbd545bda09125de07edef37ea3c1507390b46 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Fri, 13 Jun 2025 01:26:56 +0530 Subject: [PATCH 05/43] feat: fwddrop unit-test is upgraded - supports new pool API - you can provide fwd ratio percentage --- examples/unit-tests/fwddrop.c | 317 +++++++++++++++++++++------------- lib/flash/nf/flash_nf.h | 12 ++ lib/flash/nf/flash_txrx.c | 30 ++++ 3 files changed, 241 insertions(+), 118 deletions(-) diff --git a/examples/unit-tests/fwddrop.c 
b/examples/unit-tests/fwddrop.c index 070fd48..2b485a2 100644 --- a/examples/unit-tests/fwddrop.c +++ b/examples/unit-tests/fwddrop.c @@ -1,26 +1,19 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright (c) 2025 Debojeet Das * - * fwddrop: unit-test to check forward and drop capabilities of Flash framework - * We store pointers to msg.iov we want to drop in one array, and those we wish to send in another array + * fwddrop: unit-test to check forward and drop capabilities of Flash library */ - -#include -#include - #include #include #include -#include -#include -#include -#include -#include + +#include +#include #include bool done = false; struct config *cfg = NULL; -struct nf *nf; +struct nf *nf = NULL; static void int_exit(int sig) { @@ -32,23 +25,64 @@ struct appconf { int cpu_start; int cpu_end; int stats_cpu; + int fwd_ratio; + bool sriov; + uint8_t *dest_ether_addr_octet; } app_conf; -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static int hex2int(char ch) +{ + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + return -1; +} + +static uint8_t *get_mac_addr(char *mac_addr) +{ + uint8_t *dest_ether_addr_octet = (uint8_t *)malloc(6 * sizeof(uint8_t)); + for (int i = 0; i < 6; i++) { + dest_ether_addr_octet[i] = hex2int(mac_addr[0]) * 16; + mac_addr++; + dest_ether_addr_octet[i] += hex2int(mac_addr[0]); + mac_addr += 2; + } + return dest_ether_addr_octet; +} + +// clang-format off +static const char *fwddrop_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-r \tForward ratio percentage (default: 50)", + "-S \tEnable SR-IOV mode and set dest MAC address", + NULL +}; +// clang-format on + +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; - // Default values app_conf->cpu_start = 0; app_conf->cpu_end 
= 0; app_conf->stats_cpu = 1; + app_conf->sriov = false; + app_conf->fwd_ratio = 50; argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "c:e:s:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:r:S:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -58,179 +92,226 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int case 's': app_conf->stats_cpu = atoi(optarg); break; + case 'r': + app_conf->fwd_ratio = atoi(optarg); + if (app_conf->fwd_ratio < 0) + app_conf->fwd_ratio = 0; + if (app_conf->fwd_ratio > 100) + app_conf->fwd_ratio = 100; + break; + case 'S': + app_conf->dest_ether_addr_octet = get_mac_addr(optarg); + app_conf->sriov = true; + break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + + return 0; +} + +static void update_dest_mac(void *data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; + struct ether_addr tmp = { + .ether_addr_octet = { + app_conf.dest_ether_addr_octet[0], + app_conf.dest_ether_addr_octet[1], + app_conf.dest_ether_addr_octet[2], + app_conf.dest_ether_addr_octet[3], + app_conf.dest_ether_addr_octet[4], + app_conf.dest_ether_addr_octet[5], + }, + }; + *dst_addr = tmp; } -struct Args { +static void swap_mac_addresses(void *data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; + struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; + struct ether_addr tmp; + + tmp = *src_addr; + *src_addr = *dst_addr; + *dst_addr = tmp; +} + +struct sock_args { int socket_id; - int *next; - int next_size; }; -unsigned int count = 1; - static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - int socket_id = a->socket_id; - int *next = a->next; - int next_size = a->next_size; - 
log_info("SOCKET_ID: %d", socket_id); - int i, ret, nfds = 1, nrecv; + int ret; + nfds_t nfds = 1; + struct socket *xsk; struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + struct xskvec *xskvecs, *sendvecs, *dropvecs; + uint32_t i, nrecv, wsend, nsend, wdrop, ndrop, pcount, nb_frags = 0; + struct sock_args *a = (struct sock_args *)arg; - log_info("2_NEXT_SIZE: %d", next_size); + log_debug("Socket ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; - for (int i = 0; i < next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, next[i]); + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("Failed to allocate xskvecs array"); + return NULL; } - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); - - fds[0].fd = nf->thread[socket_id]->socket->fd; - fds[0].events = POLLIN; - for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!sendvecs) { + log_error("Failed to allocate sendvecs array"); + return NULL; + } - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); + dropvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!dropvecs) { + log_error("Failed to allocate dropvecs array"); + return NULL; + } - struct xskvec *drop[nrecv]; - struct xskvec *send[nrecv]; - unsigned int tot_pkt_drop = 0; - unsigned int tot_pkt_send = 0; + fds[0].fd = xsk->fd; + fds[0].events = POLLIN; + for (;;) { + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; + + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); + wsend = 0; + wdrop = 0; + pcount = 0; for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; - void *data = xv->data; + char *pkt = xskvecs[i].data; - uint8_t tmp_mac[ETH_ALEN]; - struct ethhdr *eth = (struct ethhdr *)data; 
+ if (!nb_frags++) + app_conf.sriov ? update_dest_mac(pkt) : swap_mac_addresses(pkt); - memcpy(tmp_mac, eth->h_dest, ETH_ALEN); - memcpy(eth->h_dest, eth->h_source, ETH_ALEN); - memcpy(eth->h_source, tmp_mac, ETH_ALEN); + if (IS_EOP_DESC(xskvecs[i].options)) + nb_frags = 0; - /* fwd 50% packets and drop 50% packets */ - if (count == 1) { - send[tot_pkt_send++] = &msg.msg_iov[i]; - count = 0; + if ((int)(pcount * 100 / nrecv) < app_conf.fwd_ratio) { + sendvecs[wsend++] = xskvecs[i]; } else { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - count = 1; + dropvecs[wdrop++] = xskvecs[i]; } + pcount++; } if (nrecv) { - size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); - if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { - log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + ndrop = flash__dropmsg(cfg, xsk, dropvecs, wdrop); + if (nsend != wsend || ndrop != wdrop) { + log_error("errno: %d/\"%s\"", errno, strerror(errno)); + break; } } if (done) break; } - free(msg.msg_iov); - return NULL; -} - -static void *worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } - } - } + free(xskvecs); + free(sendvecs); + free(dropvecs); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { - 
log_error("ERROR: Memory allocation failed\n"); + log_error("ERROR: Memory allocation failed"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "Unit Test: Forward and Drop Application"; + cfg->app_options = fwddrop_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; - log_info("Control Plane Setup Done"); + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; + + log_info("Control Plane setup done..."); signal(SIGINT, int_exit); signal(SIGTERM, int_exit); signal(SIGABRT, int_exit); - log_info("STARTING Data Path"); - - for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; + log_info("Starting Data Path..."); - log_info("2_NEXT_SIZE: %d", args->next_size); + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg; + } - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + for (int i = 0; i < cfg->total_sockets; i++) { + args[i].socket_id = i; - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } + CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { - log_error("ERROR: Unable to set thread affinity: %s\n", 
strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { - log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; } - pthread_detach(stats_thread); - flash__wait(cfg); + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + flash__wait(cfg); flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + free(args); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } diff --git a/lib/flash/nf/flash_nf.h b/lib/flash/nf/flash_nf.h index ee0f1bb..5fafeac 100644 --- a/lib/flash/nf/flash_nf.h +++ b/lib/flash/nf/flash_nf.h @@ -105,6 +105,18 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk */ size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nsend); +/** + * Drop messages from the socket. + * + * @param cfg: Pointer to the configuration structure. + * @param xsk: Pointer to the socket structure. + * @param xskvecs: Pointer to the array of xskvec structures containing data to drop. + * @param ndrop: Number of messages to drop. 
+ * + * @return Number of messages dropped, or 0 if no messages were dropped. + */ +size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t ndrop); + int flash__oldpoll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout); size_t flash__oldrecvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr *msg); size_t flash__oldsendmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t nsend); diff --git a/lib/flash/nf/flash_txrx.c b/lib/flash/nf/flash_txrx.c index fb4bcd7..2e4f467 100644 --- a/lib/flash/nf/flash_txrx.c +++ b/lib/flash/nf/flash_txrx.c @@ -618,3 +618,33 @@ size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec * #endif return ndrop; } + +size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t ndrop) +{ + uint32_t i, idx_fq; + uint64_t addr; + + if (!ndrop) + return 0; + + if (cfg->rx_first) { + idx_fq = __reserve_fq(cfg, xsk, ndrop); + + for (i = 0; i < ndrop; i++) { + addr = xsk_umem__extract_addr(xskvecs[i].addr); + *xsk_ring_prod__fill_addr(&xsk->fill, idx_fq++) = addr; + } + + xsk_ring_prod__submit(&xsk->fill, ndrop); + } else { + for (i = 0; i < ndrop; i++) { + addr = xsk_umem__extract_addr(xskvecs[i].addr); + flash_pool__put(xsk->flash_pool, addr); + } + } + +#ifdef STATS + xsk->ring_stats.drop_npkts += ndrop; +#endif + return ndrop; +} From cacf53b33a94805f22d9688d01417c903fba3c9c Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Sat, 14 Jun 2025 19:53:58 +0530 Subject: [PATCH 06/43] fix: robust smart_poll trigger - smart poll will be blocked until packet is received, unlike previous implementation which was configurable. - if there are pending transmissions the data path will not be blocked.
--- lib/flash/nf/flash_txrx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/flash/nf/flash_txrx.c b/lib/flash/nf/flash_txrx.c index 2e4f467..314a817 100644 --- a/lib/flash/nf/flash_txrx.c +++ b/lib/flash/nf/flash_txrx.c @@ -342,7 +342,7 @@ size_t flash__oldrecvmsg(struct config *cfg, struct socket *xsk, struct xskmsghd __complete_tx_rx_first(cfg, xsk); if (cfg->smart_poll && cfg->xsk->idle_timeout && xsk->idle_timestamp && rdtsc() > xsk->idle_timestamp) { - ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, cfg->xsk->poll_timeout); + ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, -1); if (ret <= 0) { xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); return 0; @@ -364,7 +364,7 @@ size_t flash__oldrecvmsg(struct config *cfg, struct socket *xsk, struct xskmsghd return 0; } - if (cfg->smart_poll && rcvd >= cfg->xsk->idle_thres) + if (cfg->smart_poll && (rcvd >= cfg->xsk->idle_thres || xsk->outstanding_tx)) xsk->idle_timestamp = 0; if (rcvd > cfg->xsk->batch_size) { @@ -444,7 +444,7 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk __complete_tx_completions(cfg, xsk); if (cfg->smart_poll && cfg->xsk->idle_timeout && xsk->idle_timestamp && rdtsc() > xsk->idle_timestamp) { - ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, cfg->xsk->poll_timeout); + ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, -1); if (ret <= 0) { xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); return 0; @@ -468,7 +468,7 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk return 0; } - if (cfg->smart_poll && rcvd >= cfg->xsk->idle_thres) + if (cfg->smart_poll && (rcvd >= cfg->xsk->idle_thres || xsk->outstanding_tx)) xsk->idle_timestamp = 0; if (rcvd > cfg->xsk->batch_size) From 6baf4dbe89a6bf1de03724da1a939d7ecac4cd8a Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Mon, 16 Jun 2025 12:23:20 +0530 Subject: [PATCH 07/43] fix: fwdrr unit-test 
is upgraded --- examples/unit-tests/fwdrr.c | 256 ++++++++++++++++++------------------ 1 file changed, 130 insertions(+), 126 deletions(-) diff --git a/examples/unit-tests/fwdrr.c b/examples/unit-tests/fwdrr.c index 4d61571..58587e4 100644 --- a/examples/unit-tests/fwdrr.c +++ b/examples/unit-tests/fwdrr.c @@ -3,12 +3,9 @@ * * fwdrr: A simple NF that forwards packets to many destinations in a round-robin fashion */ - #include #include #include -#include -#include #include #include @@ -16,7 +13,7 @@ bool done = false; struct config *cfg = NULL; -struct nf *nf; +struct nf *nf = NULL; static void int_exit(int sig) { @@ -29,38 +26,25 @@ struct appconf { int cpu_end; int stats_cpu; bool sriov; - uint8_t *dest_ether_addr_octet; + uint8_t dest_ether_addr_octet[6]; } app_conf; -static int hex2int(char ch) -{ - if (ch >= '0' && ch <= '9') - return ch - '0'; - if (ch >= 'A' && ch <= 'F') - return ch - 'A' + 10; - if (ch >= 'a' && ch <= 'f') - return ch - 'a' + 10; - return -1; -} - -static uint8_t *get_mac_addr(char *mac_addr) -{ - uint8_t *dest_ether_addr_octet = (uint8_t *)malloc(6 * sizeof(uint8_t)); - for (int i = 0; i < 6; i++) { - dest_ether_addr_octet[i] = hex2int(mac_addr[0]) * 16; - mac_addr++; - dest_ether_addr_octet[i] += hex2int(mac_addr[0]); - mac_addr += 2; - } - return dest_ether_addr_octet; -} +// clang-format off +static const char *fwdrr_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-S \tEnable SR-IOV mode and set dest MAC address", + NULL +}; +// clang-format on -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; + int ethaddr[6]; opterr = 0; - // Default values app_conf->cpu_start = 0; app_conf->cpu_end = 0; app_conf->stats_cpu = 1; @@ -69,8 +53,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int argc -= shift; argv += shift; 
- while ((c = getopt(argc, argv, "c:e:s:S:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:S:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -81,12 +68,21 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->stats_cpu = atoi(optarg); break; case 'S': - app_conf->dest_ether_addr_octet = get_mac_addr(optarg); + if (sscanf(optarg, "%x:%x:%x:%x:%x:%x", ðaddr[0], ðaddr[1], ðaddr[2], ðaddr[3], ðaddr[4], + ðaddr[5]) != 6) { + log_error("Invalid MAC address format: %s", optarg); + return -1; + } + for (int i = 0; i < 6; i++) + app_conf->dest_ether_addr_octet[i] = (uint8_t)ethaddr[i]; app_conf->sriov = true; break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + + return 0; } static void update_dest_mac(void *data) @@ -94,15 +90,15 @@ static void update_dest_mac(void *data) struct ether_header *eth = (struct ether_header *)data; struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; struct ether_addr tmp = { - .ether_addr_octet = { - app_conf.dest_ether_addr_octet[0], - app_conf.dest_ether_addr_octet[1], - app_conf.dest_ether_addr_octet[2], - app_conf.dest_ether_addr_octet[3], - app_conf.dest_ether_addr_octet[4], - app_conf.dest_ether_addr_octet[5], - }, - }; + .ether_addr_octet = { + app_conf.dest_ether_addr_octet[0], + app_conf.dest_ether_addr_octet[1], + app_conf.dest_ether_addr_octet[2], + app_conf.dest_ether_addr_octet[3], + app_conf.dest_ether_addr_octet[4], + app_conf.dest_ether_addr_octet[5], + }, + }; *dst_addr = tmp; } @@ -118,7 +114,7 @@ static void swap_mac_addresses(void *data) *dst_addr = tmp; } -struct Args { +struct sock_args { int socket_id; int *next; int next_size; @@ -126,159 +122,167 @@ struct Args { static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - int socket_id = a->socket_id; - int *next = a->next; - int next_size = a->next_size; - // 
free(arg); - log_info("SOCKET_ID: %d", socket_id); - static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; + int *next; + nfds_t nfds = 1; + int ret, next_size; + struct socket *xsk; + struct xskvec *xskvecs; struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + uint32_t i, nrecv, nsend, count, nb_frags = 0; + struct sock_args *a = (struct sock_args *)arg; - log_info("2_NEXT_SIZE: %d", next_size); + log_debug("Socket ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; + next = a->next; + next_size = a->next_size; for (int i = 0; i < next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, next[i]); + log_debug("Next Item [%d] ::: %d", i, next[i]); } - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("Failed to allocate xskvecs array"); + return NULL; + } - fds[0].fd = nf->thread[socket_id]->socket->fd; + fds[0].fd = xsk->fd; fds[0].events = POLLIN; - unsigned int count = 0; + + count = 0; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; - bool eop = IS_EOP_DESC(xv->options); + char *pkt = xskvecs[i].data; if (next_size != 0) { - xv->options = ((count % next_size) << 16) | (xv->options & 0xFFFF); + xskvecs[i].options = ((count % next_size) << 16) | (xskvecs[i].options & 0xFFFF); count++; } - char *pkt = xv->data; if (!nb_frags++) app_conf.sriov ? 
update_dest_mac(pkt) : swap_mac_addresses(pkt); - send[tot_pkt_send++] = &msg.msg_iov[i]; - if (eop) + if (IS_EOP_DESC(xskvecs[i].options)) nb_frags = 0; } if (nrecv) { - ret = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - if (ret != nrecv) { - log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + nsend = flash__sendmsg(cfg, xsk, xskvecs, nrecv); + if (nsend != nrecv) { + log_error("errno: %d/\"%s\"", errno, strerror(errno)); + break; } } if (done) break; } - free(msg.msg_iov); - return NULL; -} - -static void *worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } - } - } + free(xskvecs); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { - log_error("ERROR: Memory allocation failed\n"); + log_error("ERROR: Memory allocation failed"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "Round-Robin Forwarding Application"; + cfg->app_options = fwdrr_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; - log_info("Control Plane Setup Done"); + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) 
+ goto out_cfg; + + log_info("Control Plane setup done..."); signal(SIGINT, int_exit); signal(SIGTERM, int_exit); signal(SIGABRT, int_exit); - log_info("STARTING Data Path"); + log_info("Starting Data Path..."); + + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg; + } for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; + args[i].socket_id = i; + args[i].next = nf->next; + args[i].next_size = nf->next_size; - log_info("2_NEXT_SIZE: %d", args->next_size); + log_debug("Next Size ::: %d", args[i].next_size); - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + for (int i = 0; i < args[i].next_size; i++) + log_debug("Next Item [%d] ::: %d", i, nf->next[i]); - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } + CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { - log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics 
thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { - log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; } - pthread_detach(stats_thread); - flash__wait(cfg); + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + flash__wait(cfg); flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + free(args); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } From aada6a91b4d6378142c005b0759cf94e72a40dda Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Mon, 16 Jun 2025 12:24:18 +0530 Subject: [PATCH 08/43] refactor: inline mac-address computation --- examples/l2fwd/main.c | 43 +++++++--------------------------- examples/unit-tests/fwddrop.c | 44 ++++++++--------------------------- 2 files changed, 19 insertions(+), 68 deletions(-) diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c index ec7536d..30178a1 100644 --- a/examples/l2fwd/main.c +++ b/examples/l2fwd/main.c @@ -27,32 +27,9 @@ struct appconf { int cpu_end; int stats_cpu; bool sriov; - uint8_t *dest_ether_addr_octet; + uint8_t dest_ether_addr_octet[6]; } app_conf; -static int hex2int(char ch) -{ - if (ch >= '0' && ch <= '9') - return ch - '0'; - if (ch >= 'A' && ch <= 'F') - return ch - 'A' + 10; - if (ch >= 'a' && ch <= 'f') - return ch - 'a' + 10; - return -1; -} - -static uint8_t *get_mac_addr(char *mac_addr) -{ - uint8_t *dest_ether_addr_octet = (uint8_t *)malloc(6 * sizeof(uint8_t)); - for (int i = 0; i < 6; i++) { - dest_ether_addr_octet[i] = hex2int(mac_addr[0]) * 16; - mac_addr++; - dest_ether_addr_octet[i] += hex2int(mac_addr[0]); - mac_addr += 2; - } - return dest_ether_addr_octet; -} - // clang-format off static const char 
*l2fwd_options[] = { "-c \tStart CPU (default: 0)", @@ -66,6 +43,7 @@ static const char *l2fwd_options[] = { static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; + int ethaddr[6]; opterr = 0; app_conf->cpu_start = 0; @@ -91,7 +69,13 @@ static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int s app_conf->stats_cpu = atoi(optarg); break; case 'S': - app_conf->dest_ether_addr_octet = get_mac_addr(optarg); + if (sscanf(optarg, "%x:%x:%x:%x:%x:%x", ðaddr[0], ðaddr[1], ðaddr[2], ðaddr[3], ðaddr[4], + ðaddr[5]) != 6) { + log_error("Invalid MAC address format: %s", optarg); + return -1; + } + for (int i = 0; i < 6; i++) + app_conf->dest_ether_addr_octet[i] = (uint8_t)ethaddr[i]; app_conf->sriov = true; break; default: @@ -133,8 +117,6 @@ static void swap_mac_addresses(void *data) struct sock_args { int socket_id; - // int *next; - // int next_size; }; static void *socket_routine(void *arg) @@ -234,13 +216,6 @@ int main(int argc, char **argv) for (int i = 0; i < cfg->total_sockets; i++) { args[i].socket_id = i; - // args[i].next = nf->next; - // args[i].next_size = nf->next_size; - - // log_debug("Next Size ::: %d", args[i].next_size); - - // for (int i = 0; i < args[i].next_size; i++) - // log_debug("Next Item [%d] ::: %d", i, nf->next[i]); if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); diff --git a/examples/unit-tests/fwddrop.c b/examples/unit-tests/fwddrop.c index 2b485a2..0c43725 100644 --- a/examples/unit-tests/fwddrop.c +++ b/examples/unit-tests/fwddrop.c @@ -27,32 +27,9 @@ struct appconf { int stats_cpu; int fwd_ratio; bool sriov; - uint8_t *dest_ether_addr_octet; + uint8_t dest_ether_addr_octet[6]; } app_conf; -static int hex2int(char ch) -{ - if (ch >= '0' && ch <= '9') - return ch - '0'; - if (ch >= 'A' && ch <= 'F') - return ch - 'A' + 10; - if (ch >= 'a' && ch <= 'f') - return ch - 'a' + 10; - return -1; -} - -static uint8_t 
*get_mac_addr(char *mac_addr) -{ - uint8_t *dest_ether_addr_octet = (uint8_t *)malloc(6 * sizeof(uint8_t)); - for (int i = 0; i < 6; i++) { - dest_ether_addr_octet[i] = hex2int(mac_addr[0]) * 16; - mac_addr++; - dest_ether_addr_octet[i] += hex2int(mac_addr[0]); - mac_addr += 2; - } - return dest_ether_addr_octet; -} - // clang-format off static const char *fwddrop_options[] = { "-c \tStart CPU (default: 0)", @@ -67,18 +44,18 @@ static const char *fwddrop_options[] = { static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; + int ethaddr[6]; opterr = 0; app_conf->cpu_start = 0; app_conf->cpu_end = 0; app_conf->stats_cpu = 1; app_conf->sriov = false; - app_conf->fwd_ratio = 50; argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "hc:e:s:r:S:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:S:")) != -1) switch (c) { case 'h': printf("Usage: %s -h\n", argv[-shift]); @@ -92,15 +69,14 @@ static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int s case 's': app_conf->stats_cpu = atoi(optarg); break; - case 'r': - app_conf->fwd_ratio = atoi(optarg); - if (app_conf->fwd_ratio < 0) - app_conf->fwd_ratio = 0; - if (app_conf->fwd_ratio > 100) - app_conf->fwd_ratio = 100; - break; case 'S': - app_conf->dest_ether_addr_octet = get_mac_addr(optarg); + if (sscanf(optarg, "%x:%x:%x:%x:%x:%x", ðaddr[0], ðaddr[1], ðaddr[2], ðaddr[3], ðaddr[4], + ðaddr[5]) != 6) { + log_error("Invalid MAC address format: %s", optarg); + return -1; + } + for (int i = 0; i < 6; i++) + app_conf->dest_ether_addr_octet[i] = (uint8_t)ethaddr[i]; app_conf->sriov = true; break; default: From 676a1b933031a5c103374d847e4c90390f40339f Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Mon, 16 Jun 2025 22:56:47 +0530 Subject: [PATCH 09/43] feat: error handling in UDS client - UDS APIs has error handling. 
- documentation added to UDS APIs - modified UDS configuration paths in flash__configure_nf() to handle errors gracefully - removed unnecessary *next in config and NFs (unused variable) - monitor will be updated with proper error handling in upcoming commits --- examples/arpresolver/main.c | 4 +- examples/maglev/main.c | 10 +- examples/simple-firewall/main.c | 6 +- examples/unit-tests/fwdrr.c | 11 -- lib/flash/nf/flash_nf.c | 216 +++++++++++++++++++++++--------- lib/flash/params/flash_params.c | 5 + lib/flash/uds/flash_uds.c | 176 +++++++++++++++----------- lib/flash/uds/flash_uds.h | 86 +++++++++++-- lib/include/flash_defines.h | 4 +- monitor/main.c | 41 +++--- 10 files changed, 373 insertions(+), 186 deletions(-) diff --git a/examples/arpresolver/main.c b/examples/arpresolver/main.c index 285f439..445fe3e 100644 --- a/examples/arpresolver/main.c +++ b/examples/arpresolver/main.c @@ -164,13 +164,13 @@ static void configure(void) { // Need to change so that we get IPS of all NFS, not just of our local dest // send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); - // recv_data(cfg->uds_sockfd, &num_valid_ips, sizeof(int)); + // flash__recv_data(cfg->uds_sockfd, &num_valid_ips, sizeof(int)); // if (num_valid_ips != 1){ // printf("Arp-resolver should be ran along with ip4ping only"); // exit(1); // } // log_info("Number of Backends: %d", num_valid_ips); - // recv_data(cfg->uds_sockfd, ip4ping_ip, INET_ADDRSTRLEN); + // flash__recv_data(cfg->uds_sockfd, ip4ping_ip, INET_ADDRSTRLEN); // log_info("ip4ping_ip: %s", ip4ping_ip); // configuring src_mac diff --git a/examples/maglev/main.c b/examples/maglev/main.c index 87629b0..421ff11 100644 --- a/examples/maglev/main.c +++ b/examples/maglev/main.c @@ -185,15 +185,15 @@ struct backend_entry { static void load_services(void) { - send_cmd(cfg->uds_sockfd, FLASH__GET_IP_ADDR); - recv_data(cfg->uds_sockfd, srv_addr, INET_ADDRSTRLEN); + flash__send_cmd(cfg->uds_sockfd, FLASH__GET_IP_ADDR); + flash__recv_data(cfg->uds_sockfd,
srv_addr, INET_ADDRSTRLEN); log_info("NF IP: %s", srv_addr); - send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); - recv_data(cfg->uds_sockfd, &nbackends, sizeof(int)); + flash__send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); + flash__recv_data(cfg->uds_sockfd, &nbackends, sizeof(int)); log_info("Number of Backends: %d", nbackends); for (int i = 0; i < nbackends; i++) { - recv_data(cfg->uds_sockfd, bkd_addr[i], INET_ADDRSTRLEN); + flash__recv_data(cfg->uds_sockfd, bkd_addr[i], INET_ADDRSTRLEN); log_info("Backend %d IP: %s", i, bkd_addr[i]); } diff --git a/examples/simple-firewall/main.c b/examples/simple-firewall/main.c index 7efbcf1..ad8ace9 100644 --- a/examples/simple-firewall/main.c +++ b/examples/simple-firewall/main.c @@ -139,13 +139,13 @@ static void read_json_config(void) static void *configure(void) { int nbackends; - send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); - recv_data(cfg->uds_sockfd, &nbackends, sizeof(int)); + flash__send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); + flash__recv_data(cfg->uds_sockfd, &nbackends, sizeof(int)); if (nbackends != 1) { printf("Firewall is linked to %d load balancers", nbackends); exit(1); } - recv_data(cfg->uds_sockfd, load_balancer_addr, INET_ADDRSTRLEN); + flash__recv_data(cfg->uds_sockfd, load_balancer_addr, INET_ADDRSTRLEN); read_json_config(); diff --git a/examples/unit-tests/fwdrr.c b/examples/unit-tests/fwdrr.c index 58587e4..e05461c 100644 --- a/examples/unit-tests/fwdrr.c +++ b/examples/unit-tests/fwdrr.c @@ -116,13 +116,11 @@ static void swap_mac_addresses(void *data) struct sock_args { int socket_id; - int *next; int next_size; }; static void *socket_routine(void *arg) { - int *next; nfds_t nfds = 1; int ret, next_size; struct socket *xsk; @@ -133,13 +131,8 @@ static void *socket_routine(void *arg) log_debug("Socket ID: %d", a->socket_id); xsk = nf->thread[a->socket_id]->socket; - next = a->next; next_size = a->next_size; - for (int i = 0; i < next_size; i++) { - log_debug("Next Item [%d] ::: %d", 
i, next[i]); - } - xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); if (!xskvecs) { log_error("Failed to allocate xskvecs array"); @@ -230,14 +223,10 @@ int main(int argc, char **argv) for (int i = 0; i < cfg->total_sockets; i++) { args[i].socket_id = i; - args[i].next = nf->next; args[i].next_size = nf->next_size; log_debug("Next Size ::: %d", args[i].next_size); - for (int i = 0; i < args[i].next_size; i++) - log_debug("Next Item [%d] ::: %d", i, nf->next[i]); - if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); goto out_args; diff --git a/lib/flash/nf/flash_nf.c b/lib/flash/nf/flash_nf.c index ce4cc90..bfa6e96 100644 --- a/lib/flash/nf/flash_nf.c +++ b/lib/flash/nf/flash_nf.c @@ -62,83 +62,178 @@ void flash__wait(struct config *cfg) static void close_uds_conn(struct config *cfg) { - send_cmd(cfg->uds_sockfd, FLASH__CLOSE_CONN); + flash__send_cmd(cfg->uds_sockfd, FLASH__CLOSE_CONN); close(cfg->uds_sockfd); return; } -static int *__configure(struct config *cfg, struct nf *nf) +static int __configure(struct config *cfg, struct nf *nf, int **received_fd) { - int uds_sockfd = start_uds_client(); - cfg->uds_sockfd = uds_sockfd; + int uds_sockfd, i; struct nf_data data; + + uds_sockfd = flash__start_uds_client(); + if (uds_sockfd < 0) { + log_error("Failed to start UDS client"); + return -1; + } + + cfg->uds_sockfd = uds_sockfd; + data.nf_id = cfg->nf_id; data.umem_id = cfg->umem_id; - send_cmd(uds_sockfd, FLASH__GET_UMEM); - send_data(uds_sockfd, &data, sizeof(struct nf_data)); - recv_fd(uds_sockfd, &cfg->umem_fd); - log_info("RECEIVED EXISTING UMEM FD"); + if (flash__send_cmd(uds_sockfd, FLASH__GET_UMEM) < 0) { + log_error("Failed to send command to get UMEM"); + goto close_uds; + } + + if (flash__send_data(uds_sockfd, &data, sizeof(struct nf_data)) < 0) { + log_error("Failed to send NF data to UDS server"); + goto close_uds; + } + + if (flash__recv_fd(uds_sockfd, &cfg->umem_fd) < 0) { + 
log_error("Failed to receive UMEM FD from UDS server"); + goto close_uds; + } + + log_debug("RECEIVED EXISTING UMEM FD"); + + if (flash__recv_data(uds_sockfd, &cfg->total_sockets, sizeof(int)) < 0) { + log_error("Failed to receive total sockets from UDS server"); + goto close_uds; + } + + log_debug("TOTAL SOCKETS: %d", cfg->total_sockets); + + if (flash__recv_data(uds_sockfd, &cfg->umem->size, sizeof(int)) < 0) { + log_error("Failed to receive UMEM size from UDS server"); + goto close_uds; + } + + log_debug("UMEM SIZE: %d", cfg->umem->size); + + if (flash__recv_data(uds_sockfd, &cfg->umem_scale, sizeof(int)) < 0) { + log_error("Failed to receive UMEM scale from UDS server"); + goto close_uds; + } - recv_data(uds_sockfd, &cfg->total_sockets, sizeof(int)); - log_info("TOTAL SOCKETS: %d", cfg->total_sockets); + log_debug("UMEM SCALE: %d", cfg->umem_scale); - recv_data(uds_sockfd, &cfg->umem->size, sizeof(int)); - log_info("UMEM SIZE: %d", cfg->umem->size); + if (flash__send_cmd(uds_sockfd, FLASH__GET_UMEM_OFFSET) < 0) { + log_error("Failed to send command to get UMEM offset"); + goto close_uds; + } - recv_data(uds_sockfd, &cfg->umem_scale, sizeof(int)); - log_info("UMEM SCALE: %d", cfg->umem_scale); + if (flash__recv_data(uds_sockfd, &cfg->umem_offset, sizeof(int)) < 0) { + log_error("Failed to receive UMEM offset from UDS server"); + goto close_uds; + } - send_cmd(uds_sockfd, FLASH__GET_UMEM_OFFSET); - recv_data(uds_sockfd, &cfg->umem_offset, sizeof(int)); - log_info("RECEIVED umem_offset: %d", cfg->umem_offset); + log_debug("RECEIVED umem_offset: %d", cfg->umem_offset); - int *received_fd = (int *)calloc(cfg->total_sockets, sizeof(int)); + *received_fd = (int *)calloc(cfg->total_sockets, sizeof(int)); cfg->ifqueue = (int *)calloc(cfg->total_sockets, sizeof(int)); - for (int i = 0; i < cfg->total_sockets; i++) { - send_cmd(uds_sockfd, FLASH__CREATE_SOCKET); - recv_fd(uds_sockfd, received_fd + i); - recv_data(uds_sockfd, &cfg->ifqueue[i], sizeof(int)); - 
log_info("RECEIVED SOCKET-%d FD-%d, bound to Queue-%d", i, received_fd[i], cfg->ifqueue[i]); + for (i = 0; i < cfg->total_sockets; i++) { + if (flash__send_cmd(uds_sockfd, FLASH__CREATE_SOCKET) < 0) { + log_error("Failed to send command to create socket"); + goto clean_rcv_fd; + } + if (flash__recv_fd(uds_sockfd, &(*received_fd)[i]) < 0) { + log_error("Failed to receive socket FD from UDS server"); + goto clean_rcv_fd; + } + + if (flash__recv_data(uds_sockfd, &cfg->ifqueue[i], sizeof(int)) < 0) { + log_error("Failed to receive ifqueue for socket %d", i); + goto clean_rcv_fd; + } + log_debug("RECEIVED SOCKET-%d FD-%d, bound to Queue-%d", i, (*received_fd)[i], cfg->ifqueue[i]); } - send_cmd(uds_sockfd, FLASH__GET_ROUTE_INFO); - recv_data(uds_sockfd, &nf->next_size, sizeof(int)); - log_info("ROUTE SIZE: %d", nf->next_size); + if (flash__send_cmd(uds_sockfd, FLASH__GET_ROUTE_INFO) < 0) { + log_error("Failed to send command to get route info"); + goto clean_rcv_fd; + } - nf->next = (int *)calloc(nf->next_size, sizeof(int)); - recv_data(uds_sockfd, nf->next, sizeof(int) * nf->next_size); - for (int i = 0; i < nf->next_size; i++) { - log_info("ROUTE ITEM-%d %d", i, nf->next[i]); + if (flash__recv_data(uds_sockfd, &nf->next_size, sizeof(int)) < 0) { + log_error("Failed to receive route size from UDS server"); + goto clean_rcv_fd; } + log_debug("ROUTE SIZE: %d", nf->next_size); - send_cmd(uds_sockfd, FLASH__GET_BIND_FLAGS); - recv_data(uds_sockfd, &cfg->xsk->bind_flags, sizeof(uint32_t)); - log_info("BIND_FLAGS: %d", cfg->xsk->bind_flags); + if (flash__send_cmd(uds_sockfd, FLASH__GET_BIND_FLAGS) < 0) { + log_error("Failed to send command to get bind flags"); + goto clean_rcv_fd; + } + if (flash__recv_data(uds_sockfd, &cfg->xsk->bind_flags, sizeof(uint32_t)) < 0) { + log_error("Failed to receive bind flags from UDS server"); + goto clean_rcv_fd; + } - send_cmd(uds_sockfd, FLASH__GET_XDP_FLAGS); - recv_data(uds_sockfd, &cfg->xsk->xdp_flags, sizeof(uint32_t)); - 
log_info("XDP_FLAGS: %d", cfg->xsk->xdp_flags); + log_debug("BIND_FLAGS: %d", cfg->xsk->bind_flags); + + if (flash__send_cmd(uds_sockfd, FLASH__GET_XDP_FLAGS) < 0) { + log_error("Failed to send command to get XDP flags"); + goto clean_rcv_fd; + } + if (flash__recv_data(uds_sockfd, &cfg->xsk->xdp_flags, sizeof(uint32_t)) < 0) { + log_error("Failed to receive XDP flags from UDS server"); + goto clean_rcv_fd; + } + log_debug("XDP_FLAGS: %d", cfg->xsk->xdp_flags); - send_cmd(uds_sockfd, FLASH__GET_MODE); - recv_data(uds_sockfd, &cfg->xsk->mode, sizeof(uint32_t)); - log_info("MODE: %d", cfg->xsk->mode); + if (flash__send_cmd(uds_sockfd, FLASH__GET_MODE) < 0) { + log_error("Failed to send command to get mode"); + goto clean_rcv_fd; + } + if (flash__recv_data(uds_sockfd, &cfg->xsk->mode, sizeof(uint32_t)) < 0) { + log_error("Failed to receive mode from UDS server"); + goto clean_rcv_fd; + } + log_debug("MODE: %d", cfg->xsk->mode); if (cfg->xsk->mode & FLASH__POLL) { - send_cmd(uds_sockfd, FLASH__GET_POLL_TIMEOUT); - recv_data(uds_sockfd, &cfg->xsk->poll_timeout, sizeof(int)); - log_info("POLL_TIMEOUT: %d", cfg->xsk->poll_timeout); + if (flash__send_cmd(uds_sockfd, FLASH__GET_POLL_TIMEOUT) < 0) { + log_error("Failed to send command to get poll timeout"); + goto clean_rcv_fd; + } + if (flash__recv_data(uds_sockfd, &cfg->xsk->poll_timeout, sizeof(int)) < 0) { + log_error("Failed to receive poll timeout from UDS server"); + goto clean_rcv_fd; + } + log_debug("POLL_TIMEOUT: %d", cfg->xsk->poll_timeout); } - send_cmd(uds_sockfd, FLASH__GET_FRAGS_ENABLED); - recv_data(uds_sockfd, &cfg->frags_enabled, sizeof(bool)); - log_info("FRAGS_ENABLED: %d", cfg->frags_enabled); + if (flash__send_cmd(uds_sockfd, FLASH__GET_FRAGS_ENABLED) < 0) { + log_error("Failed to send command to get frags enabled"); + goto clean_rcv_fd; + } + if (flash__recv_data(uds_sockfd, &cfg->frags_enabled, sizeof(bool)) < 0) { + log_error("Failed to receive frags enabled from UDS server"); + goto clean_rcv_fd; + } 
+ log_debug("FRAGS_ENABLED: %d", cfg->frags_enabled); - send_cmd(uds_sockfd, FLASH__GET_IFNAME); - recv_data(uds_sockfd, cfg->ifname, IF_NAMESIZE); - log_info("IFNAME: %s", cfg->ifname); + if (flash__send_cmd(uds_sockfd, FLASH__GET_IFNAME) < 0) { + log_error("Failed to send command to get ifname"); + goto clean_rcv_fd; + } + if (flash__recv_data(uds_sockfd, cfg->ifname, IF_NAMESIZE) < 0) { + log_error("Failed to receive ifname from UDS server"); + goto clean_rcv_fd; + } + log_debug("IFNAME: %s", cfg->ifname); + + return 0; - return received_fd; +clean_rcv_fd: + free(*received_fd); + free(cfg->ifqueue); +close_uds: + close_uds_conn(cfg); + return -1; } static int xsk_mmap_umem_rings(struct socket *socket, struct xsk_umem_config umem_config, struct xsk_socket_config xsk_config) @@ -307,7 +402,6 @@ void flash__xsk_close(struct config *cfg, struct nf *nf) if (!cfg || !nf) return; - // Corner case handling to be done properly close_uds_conn(cfg); for (int i = 0; i < cfg->total_sockets; i++) { @@ -332,7 +426,6 @@ void flash__xsk_close(struct config *cfg, struct nf *nf) } free(nf->thread); - free(nf->next); free(nf); if (cfg->umem) { @@ -345,6 +438,9 @@ void flash__xsk_close(struct config *cfg, struct nf *nf) if (cfg->xsk) free(cfg->xsk); + if (cfg->ifqueue) + free(cfg->ifqueue); + if (cfg->umem_config && cfg->xsk_config) { free(cfg->umem_config); free(cfg->xsk_config); @@ -361,19 +457,25 @@ static bool xsk_page_aligned(void *buffer) int flash__configure_nf(struct nf **_nf, struct config *cfg) { int i, size; - int *sockfd; + int *sockfd = NULL; struct nf *nf; + if (!cfg || !_nf) { + log_error("ERROR: NULL pointer as arguments"); + return -1; + } + nf = (struct nf *)calloc(1, sizeof(struct nf)); if (!nf) { log_error("ERROR: Memory allocation failed for nf"); return -1; } - // monitor communication happens here - overhaul required - // corner case handling etc. 
- // routing should be handled better - sockfd = __configure(cfg, nf); + if (__configure(cfg, nf, &sockfd) < 0) { + log_error("ERROR: (NF configuration) __configure failed"); + free(nf); + return -1; + } size = cfg->umem->size; cfg->umem->buffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, cfg->umem_fd, 0); @@ -436,8 +538,6 @@ int flash__configure_nf(struct nf **_nf, struct config *cfg) } } - // Is this handling correct?? - free(cfg->ifqueue); free(sockfd); *_nf = nf; return 0; diff --git a/lib/flash/params/flash_params.c b/lib/flash/params/flash_params.c index 0883a4c..346ef47 100644 --- a/lib/flash/params/flash_params.c +++ b/lib/flash/params/flash_params.c @@ -234,6 +234,11 @@ int flash__parse_cmdline_args(int argc, char **argv, struct config *cfg) { int ret; + if (!cfg) { + log_error("ERROR: (Parsing error) NULL config pointer"); + return -1; + } + cfg->umem = calloc(1, sizeof(struct umem_config)); cfg->xsk = calloc(1, sizeof(struct xsk_config)); if (!cfg->xsk || !cfg->umem) { diff --git a/lib/flash/uds/flash_uds.c b/lib/flash/uds/flash_uds.c index 1723138..7e45e02 100644 --- a/lib/flash/uds/flash_uds.c +++ b/lib/flash/uds/flash_uds.c @@ -14,55 +14,116 @@ #include "flash_uds.h" -void send_cmd(int sockfd, int cmd) +int flash__recv_cmd(int sockfd) { - int rval = write(sockfd, &cmd, sizeof(int)); - if (rval < 0) { - log_error("Error writing stream cmd"); - exit(EXIT_FAILURE); - } + int cmd, rval; + rval = read(sockfd, &cmd, sizeof(int)); + if (rval < 0) + return -1; + + return cmd; } -void send_data(int sockfd, void *data, int size) +int flash__send_cmd(int sockfd, int cmd) { - int rval = write(sockfd, data, size); - if (rval < 0) { - log_error("Error writing stream data"); - exit(EXIT_FAILURE); - } + int rval = write(sockfd, &cmd, sizeof(int)); + if (rval < 0) + return -1; + + return rval; } -void recv_data(int sockfd, void *data, int size) +int flash__recv_data(int sockfd, void *data, int size) { int rval = read(sockfd, data, size); - if (rval < 0) { - 
log_error("Error reading stream data"); - exit(EXIT_FAILURE); - } + if (rval < 0) + return -1; + + return rval; } -int recv_cmd(int sockfd) +int flash__send_data(int sockfd, void *data, int size) { - int cmd, rval; - rval = read(sockfd, &cmd, sizeof(int)); - if (rval < 0) { - log_error("Error reading stream cmd"); - exit(EXIT_FAILURE); - } + int rval = write(sockfd, data, size); + if (rval < 0) + return -1; - return cmd; + return rval; } -int send_fd(int sockfd, int fd) +int flash__recv_fd(int sockfd, int *_fd) { - char cmsgbuf[CMSG_SPACE(sizeof(int))]; - struct msghdr msgh; + char cms[CMSG_SPACE(sizeof(int))]; + struct cmsghdr *cmsg; + struct msghdr msg; struct iovec iov; char buf[1]; + int len; + + iov.iov_base = buf; + iov.iov_len = 1; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_flags = 0; + msg.msg_control = (caddr_t)cms; + msg.msg_controllen = sizeof(cms); + + len = recvmsg(sockfd, &msg, 0); + if (len < 0) { + log_error("Recvmsg failed length incorrect."); + return -1; + } + + if (buf[0] == 'n') { + log_error("Received error message -1"); + return -1; + } else if (buf[0] == 'y') { + cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg == NULL) { + log_error("No ancillary data."); + return -1; + } + + *_fd = *(int *)CMSG_DATA(cmsg); + + return 0; + } else { + log_error("Received unknown message: %s", buf); + return -1; + } +} + +int flash__send_fd(int sockfd, int fd) +{ + int ret; + char cmsgbuf[CMSG_SPACE(sizeof(int))]; + struct msghdr msgh = { 0 }; + struct iovec iov = { 0 }; + char buf[1] = { 'y' }; if (fd == -1) { - log_error("Incorrect fd = %d\n", fd); - exit(EXIT_FAILURE); + log_debug("Sending error message -1", fd); + buf[0] = 'n'; + + iov.iov_base = buf; + iov.iov_len = 1; + + msgh.msg_name = NULL; + msgh.msg_namelen = 0; + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + ret = sendmsg(sockfd, &msgh, 0); + + if (ret == -1) { + log_error("Sendmsg failed with %s", strerror(errno)); + return -1; + } + + return ret; } 
/* We must transmit at least 1 byte of real data in order @@ -86,17 +147,17 @@ int send_fd(int sockfd, int fd) /* Write the fd as ancillary data */ *(int *)CMSG_DATA(cmsg) = fd; - int ret = sendmsg(sockfd, &msgh, 0); + ret = sendmsg(sockfd, &msgh, 0); if (ret == -1) { log_error("Sendmsg failed with %s", strerror(errno)); - exit(EXIT_FAILURE); + return -1; } return ret; } -int start_uds_server(void) +int flash__start_uds_server(void) { int sockfd; int flag = 1; @@ -106,13 +167,13 @@ int start_uds_server(void) if (mkdir(UNIX_SOCKET_DIR, 0777) == -1 && errno != EEXIST) { log_error("Error creating directory %s: %s", UNIX_SOCKET_DIR, strerror(errno)); - exit(EXIT_FAILURE); + return -1; } sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { log_error("Error opening socket stream: %s", strerror(errno)); - exit(EXIT_FAILURE); + return -1; } unlink(UNIX_SOCKET_NAME); @@ -122,13 +183,13 @@ int start_uds_server(void) if (bind(sockfd, (struct sockaddr *)&server, sizeof(struct sockaddr_un))) { log_error("Binding to socket stream failed: %s", strerror(errno)); - exit(EXIT_FAILURE); + return -1; } return sockfd; } -int start_uds_client(void) +int flash__start_uds_client(void) { struct sockaddr_un server; int sockfd; @@ -136,7 +197,7 @@ int start_uds_client(void) sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { log_error("Error opening socket stream: %s", strerror(errno)); - exit(EXIT_FAILURE); + return -1; } server.sun_family = AF_UNIX; @@ -145,45 +206,8 @@ int start_uds_client(void) if (connect(sockfd, (struct sockaddr *)&server, sizeof(struct sockaddr_un)) < 0) { close(sockfd); log_error("Error connecting stream socket: %s", strerror(errno)); - exit(EXIT_FAILURE); + return -1; } return sockfd; } - -int recv_fd(int sockfd, int *_fd) -{ - char cms[CMSG_SPACE(sizeof(int))]; - struct cmsghdr *cmsg; - struct msghdr msg; - struct iovec iov; - char buf[1]; - int len; - - iov.iov_base = buf; - iov.iov_len = 1; - - msg.msg_name = 0; - msg.msg_namelen = 0; - msg.msg_iov 
= &iov; - msg.msg_iovlen = 1; - msg.msg_flags = 0; - msg.msg_control = (caddr_t)cms; - msg.msg_controllen = sizeof(cms); - - len = recvmsg(sockfd, &msg, 0); - - if (len < 0) { - log_error("Recvmsg failed length incorrect.\n"); - exit(EXIT_FAILURE); - } - - cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - log_error("No ancillary data.\n"); - exit(EXIT_FAILURE); - } - *_fd = *(int *)CMSG_DATA(cmsg); - - return 0; -} diff --git a/lib/flash/uds/flash_uds.h b/lib/flash/uds/flash_uds.h index 9b66546..8d10f45 100644 --- a/lib/flash/uds/flash_uds.h +++ b/lib/flash/uds/flash_uds.h @@ -26,13 +26,83 @@ #define FLASH__GET_IP_ADDR 14 #define FLASH__GET_DST_IP_ADDR 15 -int send_fd(int sockfd, int fd); -int start_uds_server(void); -int start_uds_client(void); -int recv_fd(int sockfd, int *_fd); -void send_cmd(int sockfd, int cmd); -void send_data(int sockfd, void *data, int size); -void recv_data(int sockfd, void *data, int size); -int recv_cmd(int sockfd); +/* UDS Control path APIs*/ + +/** + * Starts a UDS server connection + * + * @return socket file descriptor on success, -1 on failure + */ +int flash__start_uds_server(void); + +/** + * Starts a UDS client connection to the monitor + * + * @return socket file descriptor on success, -1 on failure + */ +int flash__start_uds_client(void); + +/* UDS Data path APIs */ + +/** + * Receive a command in the monitor + * + * @param sockfd The socket file descriptor + * + * @return The command received, or -1 on error + */ +int flash__recv_cmd(int sockfd); + +/** + * Send a command to the monitor + * + * @param sockfd The socket file descriptor + * @param cmd The command to send + * + * @return The number of bytes sent, or -1 on error + */ +int flash__send_cmd(int sockfd, int cmd); + +/** + * Receive data from the monitor + * + * @param sockfd The socket file descriptor + * @param data Pointer to the buffer where the received data will be stored + * @param size Size of the data to receive in bytes + * + * @return The number of bytes 
received, or -1 on error + */ +int flash__recv_data(int sockfd, void *data, int size); + +/** + * Send data to the monitor + * + * @param sockfd The socket file descriptor + * @param data Pointer to the data to send + * @param size Size of the data in bytes + * + * @return The number of bytes sent, or -1 on error + */ +int flash__send_data(int sockfd, void *data, int size); + +/** + * Receive a file descriptor from the monitor + * + * @param sockfd The socket file descriptor + * @param _fd Pointer to an integer where the received file descriptor will be stored + * + * @return 0 on success, -1 on error + */ +int flash__recv_fd(int sockfd, int *_fd); + +/** + * Send a file descriptor from the monitor + * + * @param sockfd The socket file descriptor + * @param fd The file descriptor to send + * + * @return 0 on success, -1 on error + */ +int flash__send_fd(int sockfd, int fd); #endif /* __FLASH_UDS_H */ diff --git a/lib/include/flash_defines.h b/lib/include/flash_defines.h index 6c5b70f..356c016 100644 --- a/lib/include/flash_defines.h +++ b/lib/include/flash_defines.h @@ -51,7 +51,7 @@ struct umem_config { struct config { const char *app_name; - const char * const *app_options; + const char *const *app_options; int umem_fd; int uds_sockfd; int umem_scale; @@ -199,7 +199,7 @@ struct nf { int id; char ip[INET_ADDRSTRLEN]; uint16_t port; - int *next; + int *next; // To be removed int next_size; struct thread **thread; bool is_up; diff --git a/monitor/main.c b/monitor/main.c index 4bf91cb..c021cec 100644 --- a/monitor/main.c +++ b/monitor/main.c @@ -35,74 +35,73 @@ static void *handle_nf(void *arg) do { log_info("Waiting for command..."); - cmd = recv_cmd(msgsock); + cmd = flash__recv_cmd(msgsock); switch (cmd) { case FLASH__GET_UMEM: - recv_data(msgsock, data, sizeof(struct nf_data)); + flash__recv_data(msgsock, data, sizeof(struct nf_data)); if (configure_umem(data, &umem) == -1) { continue; } - send_fd(msgsock, umem->cfg->umem_fd); - send_data(msgsock, 
&umem->nf[data->nf_id]->thread_count, sizeof(int)); - send_data(msgsock, &umem->cfg->umem->size, sizeof(int)); - send_data(msgsock, &umem->cfg->umem_scale, sizeof(int)); + flash__send_fd(msgsock, umem->cfg->umem_fd); + flash__send_data(msgsock, &umem->nf[data->nf_id]->thread_count, sizeof(int)); + flash__send_data(msgsock, &umem->cfg->umem->size, sizeof(int)); + flash__send_data(msgsock, &umem->cfg->umem_scale, sizeof(int)); break; case FLASH__CREATE_SOCKET: if (umem != NULL) { struct socket *sock = create_new_socket(umem, data->nf_id); - send_fd(msgsock, sock->fd); - send_data(msgsock, &sock->ifqueue, sizeof(int)); + flash__send_fd(msgsock, sock->fd); + flash__send_data(msgsock, &sock->ifqueue, sizeof(int)); } break; case FLASH__GET_UMEM_OFFSET: int offset = data->nf_id * umem->nf[data->nf_id]->thread_count + umem->nf[data->nf_id]->current_thread_count; - send_data(msgsock, &offset, sizeof(int)); + flash__send_data(msgsock, &offset, sizeof(int)); break; case FLASH__GET_ROUTE_INFO: - send_data(msgsock, &umem->nf[data->nf_id]->next_size, sizeof(int)); - send_data(msgsock, umem->nf[data->nf_id]->next, sizeof(int) * umem->nf[data->nf_id]->next_size); + flash__send_data(msgsock, &umem->nf[data->nf_id]->next_size, sizeof(int)); break; case FLASH__GET_BIND_FLAGS: - send_data(msgsock, &umem->cfg->xsk->bind_flags, sizeof(__u32)); + flash__send_data(msgsock, &umem->cfg->xsk->bind_flags, sizeof(__u32)); break; case FLASH__GET_XDP_FLAGS: - send_data(msgsock, &umem->cfg->xsk->xdp_flags, sizeof(__u32)); + flash__send_data(msgsock, &umem->cfg->xsk->xdp_flags, sizeof(__u32)); break; case FLASH__GET_MODE: - send_data(msgsock, &umem->cfg->xsk->mode, sizeof(__u32)); + flash__send_data(msgsock, &umem->cfg->xsk->mode, sizeof(__u32)); break; case FLASH__GET_POLL_TIMEOUT: - send_data(msgsock, &umem->cfg->xsk->poll_timeout, sizeof(int)); + flash__send_data(msgsock, &umem->cfg->xsk->poll_timeout, sizeof(int)); break; case FLASH__GET_FRAGS_ENABLED: - send_data(msgsock, 
&umem->cfg->frags_enabled, sizeof(bool)); + flash__send_data(msgsock, &umem->cfg->frags_enabled, sizeof(bool)); break; case FLASH__GET_IFNAME: - send_data(msgsock, &umem->cfg->ifname, IF_NAMESIZE); + flash__send_data(msgsock, &umem->cfg->ifname, IF_NAMESIZE); break; case FLASH__GET_IP_ADDR: - send_data(msgsock, umem->nf[data->nf_id]->ip, INET_ADDRSTRLEN); + flash__send_data(msgsock, umem->nf[data->nf_id]->ip, INET_ADDRSTRLEN); log_info("NF IP: %s", umem->nf[data->nf_id]->ip); break; case FLASH__GET_DST_IP_ADDR: - send_data(msgsock, &umem->nf[data->nf_id]->next_size, sizeof(int)); + flash__send_data(msgsock, &umem->nf[data->nf_id]->next_size, sizeof(int)); log_info("Number of Backends: %d", umem->nf[data->nf_id]->next_size); for (int i = 0; i < umem->nf[data->nf_id]->next_size; i++) { log_info("Sending IP %s", umem->nf[umem->nf[data->nf_id]->next[i]]->ip); log_info("Next NF: %d", umem->nf[data->nf_id]->next[i]); - send_data(msgsock, umem->nf[umem->nf[data->nf_id]->next[i]]->ip, INET_ADDRSTRLEN); + flash__send_data(msgsock, umem->nf[umem->nf[data->nf_id]->next[i]]->ip, INET_ADDRSTRLEN); } break; @@ -128,7 +127,7 @@ static void *worker__uds_server(void *arg) { (void)arg; - unix_socket_server = start_uds_server(); + unix_socket_server = flash__start_uds_server(); struct pollfd fds[1] = {}; fds[0].fd = unix_socket_server; fds[0].events = POLLIN; From 2c7fbd56392c3fc982b7c0aebc52629f9847dd1b Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Wed, 18 Jun 2025 07:34:13 +0530 Subject: [PATCH 10/43] feat: new txgen nf - a new flash__allocmsg() data path API which uses the pool library to allocate tx packets - a new txgen nf which transmit packets at max rate. caveats: there are packet copies involved in the implementation. performance decreases with increasing packet sizes. 
--- examples/meson.build | 1 + examples/txgen/main.c | 359 +++++++++++++++++++++++++++++++++++++ examples/txgen/meson.build | 6 + lib/flash/nf/flash_nf.h | 20 ++- lib/flash/nf/flash_txrx.c | 33 ++++ meson.build | 2 +- 6 files changed, 417 insertions(+), 4 deletions(-) create mode 100644 examples/txgen/main.c create mode 100644 examples/txgen/meson.build diff --git a/examples/meson.build b/examples/meson.build index dc7278d..c6afb34 100644 --- a/examples/meson.build +++ b/examples/meson.build @@ -12,6 +12,7 @@ dirs = [ 'firewall', 'arpresolver', 'mica', + 'txgen' ] def_deps = [include, log, nf, params, uds] diff --git a/examples/txgen/main.c b/examples/txgen/main.c new file mode 100644 index 0000000..5e939d7 --- /dev/null +++ b/examples/txgen/main.c @@ -0,0 +1,359 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025 Debojeet Das + * + * txgen: A packet generator that transmits Ethernet+IPv4+UDP frames with + * configurable addresses and ports. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +bool done = false; +struct config *cfg = NULL; +struct nf *nf = NULL; +uint8_t *packet_template = NULL; + +static void int_exit(int sig) +{ + log_debug("Received Signal: %d", sig); + done = true; +} + +struct appconf { + int cpu_start; + int cpu_end; + int stats_cpu; + uint8_t src_ether_addr_octet[6]; + uint8_t dest_ether_addr_octet[6]; + uint32_t src_ip; + uint32_t dest_ip; + uint16_t src_port; + uint16_t dest_port; + uint16_t payload_len; +} app_conf; + +// clang-format off +static const char *txgen_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-S \tSrc MAC address to use (default: a0:a1:a2:a3:a4:a5)", + "-D \tDest MAC address to use (default: a0:a1:a2:a3:a4:a5)", + "-A \tSrc IPv4 address to use (default: 192.168.1.1)", + "-B \tDest IPv4 address to use (default: 192.168.2.1)", + "-P \tSrc port to use (default: 1234)", + "-Q 
\tDest port to use (default: 5678)", + "-L \tPayload length (default: 5 bytes)", + NULL +}; +// clang-format on + +static int parse_mac(const char *str, uint8_t *mac) +{ + int vals[6]; + if (sscanf(str, "%x:%x:%x:%x:%x:%x", &vals[0], &vals[1], &vals[2], &vals[3], &vals[4], &vals[5]) != 6) { + log_error("Invalid MAC address: %s", str); + return -1; + } + for (int i = 0; i < 6; i++) + mac[i] = (uint8_t)vals[i]; + + return 0; +} + +static int parse_ip(const char *str, uint32_t *ip) +{ + struct in_addr addr; + if (inet_aton(str, &addr) == 0) { + log_error("Invalid IPv4 address: %s", str); + return -1; + } + *ip = addr.s_addr; + + return 0; +} + +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +{ + int c; + opterr = 0; + + app_conf->cpu_start = 0; + app_conf->cpu_end = 0; + app_conf->stats_cpu = 1; + app_conf->src_ip = htonl(0xC0A80101); + app_conf->dest_ip = htonl(0xC0A80201); + app_conf->src_port = htons(1234); + app_conf->dest_port = htons(5678); + app_conf->payload_len = 5; + for (int i = 0; i < 6; i++) { + app_conf->src_ether_addr_octet[i] = 0xA0 + i; + app_conf->dest_ether_addr_octet[i] = 0xA0 + i; + } + + argc -= shift; + argv += shift; + + while ((c = getopt(argc, argv, "hc:e:s:S:D:A:B:P:Q:L:")) != -1) + switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; + case 'c': + app_conf->cpu_start = atoi(optarg); + break; + case 'e': + app_conf->cpu_end = atoi(optarg); + break; + case 's': + app_conf->stats_cpu = atoi(optarg); + break; + case 'S': + if (parse_mac(optarg, app_conf->src_ether_addr_octet) < 0) + return -1; + break; + case 'D': + if (parse_mac(optarg, app_conf->dest_ether_addr_octet) < 0) + return -1; + break; + case 'A': + if (parse_ip(optarg, &app_conf->src_ip) < 0) + return -1; + break; + case 'B': + if (parse_ip(optarg, &app_conf->dest_ip) < 0) + return -1; + break; + case 'P': + app_conf->src_port = htons(atoi(optarg)); + break; + case 'Q': + app_conf->dest_port = htons(atoi(optarg)); + 
break; + case 'L': + app_conf->payload_len = atoi(optarg); + if (app_conf->payload_len > 1500) { + log_error("Invalid payload length: %d.", app_conf->payload_len); + return -1; + } + break; + default: + printf("Usage: %s -h\n", argv[-shift]); + return -1; + } + + return 0; +} + +static uint16_t csum16(const void *data, size_t len) +{ + const uint16_t *buf = (const uint16_t *)data; + uint32_t sum = 0; + + while (len > 1) { + sum += *buf++; + len -= 2; + } + + if (len) + sum += *(const uint8_t *)buf; + + while (sum >> 16) + sum = (sum & 0xFFFF) + (sum >> 16); + + return (uint16_t)(~sum); +} + +static void setup_packet(void *data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct iphdr *ip = (struct iphdr *)(eth + 1); + struct udphdr *udp = (struct udphdr *)(ip + 1); + + memcpy(eth->ether_shost, app_conf.src_ether_addr_octet, ETH_ALEN); + memcpy(eth->ether_dhost, app_conf.dest_ether_addr_octet, ETH_ALEN); + eth->ether_type = htons(ETH_P_IP); + + ip->ihl = 5; + ip->version = 4; + ip->tos = 0; + ip->tot_len = htons(sizeof(struct iphdr) + sizeof(struct udphdr) + app_conf.payload_len); + ip->id = htons(0x1234); + ip->frag_off = 0; + ip->ttl = 64; + ip->protocol = IPPROTO_UDP; + ip->check = 0; + ip->saddr = app_conf.src_ip; + ip->daddr = app_conf.dest_ip; + ip->check = csum16(ip, sizeof(struct iphdr)); + + udp->source = app_conf.src_port; + udp->dest = app_conf.dest_port; + udp->len = htons(sizeof(struct udphdr) + app_conf.payload_len); + udp->check = 0; + + char *payload = (char *)(udp + 1); + memset(payload, 'A', app_conf.payload_len); +} + +struct sock_args { + int socket_id; +}; + +static void *socket_routine(void *arg) +{ + struct socket *xsk; + struct xskvec *xskvecs; + uint32_t i, nalloc, nsend; + struct sock_args *a = (struct sock_args *)arg; + + log_debug("Socket ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; + + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("Failed to allocate 
xskvecs array"); + return NULL; + } + + size_t packet_size = sizeof(struct ether_header) + sizeof(struct iphdr) + sizeof(struct udphdr) + app_conf.payload_len; + + for (;;) { + nalloc = flash__allocmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); + for (i = 0; i < nalloc; i++) { + memcpy(xskvecs[i].data, packet_template, packet_size); + xskvecs[i].len = packet_size; + xskvecs[i].options = 0; + } + + if (nalloc) { + nsend = flash__sendmsg(cfg, xsk, xskvecs, nalloc); + if (nsend != nalloc) { + log_error("errno: %d/\"%s\"", errno, strerror(errno)); + break; + } + } + + if (done) + break; + } + + free(xskvecs); + return NULL; +} + +int main(int argc, char **argv) +{ + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; + cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + size_t packet_size; + + cfg = calloc(1, sizeof(struct config)); + if (!cfg) { + log_error("ERROR: Memory allocation failed"); + exit(EXIT_FAILURE); + } + + cfg->app_name = "Traffic Generation Application"; + cfg->app_options = txgen_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + packet_size = sizeof(struct ether_header) + sizeof(struct iphdr) + sizeof(struct udphdr) + app_conf.payload_len; + log_debug("Packet size: %zu bytes", packet_size); + + packet_template = (uint8_t *)calloc(1, packet_size); + if (!packet_template) { + log_error("ERROR: Memory allocation failed for packet template"); + goto out_cfg; + } + + setup_packet(packet_template); + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_pkt; + + log_info("Control Plane setup done..."); + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + signal(SIGABRT, int_exit); + + log_info("Starting Data Path..."); + + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_pkt; + } + + for (int i 
= 0; i < cfg->total_sockets; i++) { + args[i].socket_id = i; + + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { + log_error("Error creating socket thread"); + goto out_args; + } + + CPU_ZERO(&cpuset); + CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); + if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; + } + + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + } + + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { + log_error("Error creating statistics thread"); + goto out_args; + } + CPU_ZERO(&cpuset); + CPU_SET(app_conf.stats_cpu, &cpuset); + if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { + log_error("ERROR: Unable to set thread affinity: %s", strerror(errno)); + goto out_args; + } + + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + + flash__wait(cfg); + flash__xsk_close(cfg, nf); + + exit(EXIT_SUCCESS); + +out_args: + free(args); +out_pkt: + free(packet_template); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); +} diff --git a/examples/txgen/meson.build b/examples/txgen/meson.build new file mode 100644 index 0000000..d90acbe --- /dev/null +++ b/examples/txgen/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2025 Debojeet Das + +sources = files('main.c') + +executable('txgen', sources, c_args: cflags, install: true, dependencies: deps) \ No newline at end of file diff --git a/lib/flash/nf/flash_nf.h b/lib/flash/nf/flash_nf.h index 5fafeac..da121bb 100644 --- a/lib/flash/nf/flash_nf.h +++ b/lib/flash/nf/flash_nf.h @@ -30,7 +30,7 @@ struct stats_conf { extern bool done; -/* Control Path API 
*/ +/* Control Path APIs */ /** * Configure the NF with the provided configuration. @@ -66,7 +66,7 @@ void flash__wait(struct config *cfg); */ void flash__xsk_close(struct config *cfg, struct nf *nf); -/* Data Path API */ +/* Data Path APIs */ /** * Poll the NF for incoming packets. @@ -117,11 +117,25 @@ size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk */ size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t ndrop); +/** + * Allocate memory for messages to be sent. + * + * @param cfg: Pointer to the configuration structure. + * @param xsk: Pointer to the socket structure. + * @param xskvecs: Pointer to the array of xskvec structures to allocate memory for. + * @param nalloc: Number of messages to allocate memory for. + * + * @return Number of messages allocated, or 0 on failure. + */ +size_t flash__allocmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nalloc); + int flash__oldpoll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout); size_t flash__oldrecvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr *msg); size_t flash__oldsendmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t nsend); size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop); +/* Helper APIs */ + /** * Thread function to periodically dump statistics of the NF. 
* @@ -132,7 +146,7 @@ size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec * */ void *flash__stats_thread(void *conf); -/* Advanced API */ +/* Advanced APIs */ void flash__populate_fill_ring(struct thread **thread, int frame_size, int total_sockets, int umem_offset, int umem_scale); diff --git a/lib/flash/nf/flash_txrx.c b/lib/flash/nf/flash_txrx.c index 314a817..9f4e0bd 100644 --- a/lib/flash/nf/flash_txrx.c +++ b/lib/flash/nf/flash_txrx.c @@ -648,3 +648,36 @@ size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk #endif return ndrop; } + +size_t flash__allocmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nalloc) +{ + uint32_t i; + uint64_t addr; + + if (!nalloc || !xsk || !xskvecs || !cfg || !xsk->flash_pool) + return 0; + + if (cfg->rx_first) { + log_error("Cannot allocate xskvecs in rx_first mode"); + return 0; + } else { + for (i = 0; i < nalloc; i++) { + while (!flash_pool__get(xsk->flash_pool, &addr)) { + __complete_tx_completions(cfg, xsk); + if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->tx)) { +#ifdef STATS + xsk->app_stats.tx_wakeup_sendtos++; +#endif + __kick_tx(xsk); + } + } + + xskvecs[i].data = xsk_umem__get_data(cfg->umem->buffer, addr); + xskvecs[i].addr = addr; + xskvecs[i].len = cfg->umem->frame_size; + xskvecs[i].options = 0; + } + } + + return nalloc; +} diff --git a/meson.build b/meson.build index e511e7c..35f38ba 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'flash', 'C', version: '25.5-beta', - license: 'BSD', + license: 'Apache-2.0', default_options: [ 'buildtype=release', 'default_library=shared', From ddce2dd8bec71122d6ce6f96d7df06db644f5e04 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Tue, 24 Jun 2025 18:20:47 +0530 Subject: [PATCH 11/43] feat: added flash_helpers - add MAC address retrieval function and integrate into existing components. 
- bug fixes in l2fwd, simplefwd, fwdrr, and fwddrop --- examples/l2fwd/main.c | 6 +++- examples/simplefwd/main.c | 6 +++- examples/txgen/main.c | 66 ++++++++++++++++++++++++++--------- examples/unit-tests/fwddrop.c | 16 +++++++-- examples/unit-tests/fwdrr.c | 6 +++- lib/flash/nf/flash_helpers.c | 44 +++++++++++++++++++++++ lib/flash/nf/flash_nf.h | 10 ++++++ lib/flash/nf/meson.build | 2 +- 8 files changed, 134 insertions(+), 22 deletions(-) create mode 100644 lib/flash/nf/flash_helpers.c diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c index 30178a1..e13159f 100644 --- a/examples/l2fwd/main.c +++ b/examples/l2fwd/main.c @@ -211,7 +211,7 @@ int main(int argc, char **argv) args = calloc(cfg->total_sockets, sizeof(struct sock_args)); if (!args) { log_error("ERROR: Memory allocation failed for sock_args"); - goto out_cfg; + goto out_cfg_close; } for (int i = 0; i < cfg->total_sockets; i++) { @@ -260,7 +260,11 @@ int main(int argc, char **argv) exit(EXIT_SUCCESS); out_args: + done = true; free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); out_cfg: free(cfg); exit(EXIT_FAILURE); diff --git a/examples/simplefwd/main.c b/examples/simplefwd/main.c index 484eb15..c68b35e 100644 --- a/examples/simplefwd/main.c +++ b/examples/simplefwd/main.c @@ -171,7 +171,7 @@ int main(int argc, char **argv) args = calloc(cfg->total_sockets, sizeof(struct sock_args)); if (!args) { log_error("ERROR: Memory allocation failed for sock_args"); - goto out_cfg; + goto out_cfg_close; } for (int i = 0; i < cfg->total_sockets; i++) { @@ -220,7 +220,11 @@ int main(int argc, char **argv) exit(EXIT_SUCCESS); out_args: + done = true; free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); out_cfg: free(cfg); exit(EXIT_FAILURE); diff --git a/examples/txgen/main.c b/examples/txgen/main.c index 5e939d7..b029a72 100644 --- a/examples/txgen/main.c +++ b/examples/txgen/main.c @@ -32,6 +32,8 @@ struct appconf { int cpu_start; int cpu_end; int stats_cpu; + bool 
custom_src_ether_addr; + bool custom_dest_ether_addr; uint8_t src_ether_addr_octet[6]; uint8_t dest_ether_addr_octet[6]; uint32_t src_ip; @@ -46,8 +48,8 @@ static const char *txgen_options[] = { "-c \tStart CPU (default: 0)", "-e \tEnd CPU (default: 0)", "-s \tStats CPU (default: 1)", - "-S \tSrc MAC address to use (default: a0:a1:a2:a3:a4:a5)", - "-D \tDest MAC address to use (default: a0:a1:a2:a3:a4:a5)", + "-S \tSrc MAC address to use (default: NIC MAC address)", + "-D \tDest MAC address to use (default: NIC MAC address)", "-A \tSrc IPv4 address to use (default: 192.168.1.1)", "-B \tDest IPv4 address to use (default: 192.168.2.1)", "-P \tSrc port to use (default: 1234)", @@ -95,10 +97,8 @@ static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int s app_conf->src_port = htons(1234); app_conf->dest_port = htons(5678); app_conf->payload_len = 5; - for (int i = 0; i < 6; i++) { - app_conf->src_ether_addr_octet[i] = 0xA0 + i; - app_conf->dest_ether_addr_octet[i] = 0xA0 + i; - } + app_conf->custom_src_ether_addr = false; + app_conf->custom_dest_ether_addr = false; argc -= shift; argv += shift; @@ -120,10 +120,12 @@ static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int s case 'S': if (parse_mac(optarg, app_conf->src_ether_addr_octet) < 0) return -1; + app_conf->custom_src_ether_addr = true; break; case 'D': if (parse_mac(optarg, app_conf->dest_ether_addr_octet) < 0) return -1; + app_conf->custom_dest_ether_addr = true; break; case 'A': if (parse_ip(optarg, &app_conf->src_ip) < 0) @@ -173,14 +175,32 @@ static uint16_t csum16(const void *data, size_t len) return (uint16_t)(~sum); } -static void setup_packet(void *data) +static int setup_packet(void *data) { struct ether_header *eth = (struct ether_header *)data; struct iphdr *ip = (struct iphdr *)(eth + 1); struct udphdr *udp = (struct udphdr *)(ip + 1); + struct ether_addr tmp_addr; - memcpy(eth->ether_shost, app_conf.src_ether_addr_octet, ETH_ALEN); - 
memcpy(eth->ether_dhost, app_conf.dest_ether_addr_octet, ETH_ALEN); + if (app_conf.custom_src_ether_addr) + memcpy(eth->ether_shost, app_conf.src_ether_addr_octet, ETH_ALEN); + else { + if (flash__get_macaddr(cfg, &tmp_addr) < 0) { + log_error("Failed to get source MAC address"); + return -1; + } + memcpy(eth->ether_shost, tmp_addr.ether_addr_octet, ETH_ALEN); + } + + if (app_conf.custom_dest_ether_addr) + memcpy(eth->ether_dhost, app_conf.dest_ether_addr_octet, ETH_ALEN); + else { + if (flash__get_macaddr(cfg, &tmp_addr) < 0) { + log_error("Failed to get destination MAC address"); + return -1; + } + memcpy(eth->ether_dhost, tmp_addr.ether_addr_octet, ETH_ALEN); + } eth->ether_type = htons(ETH_P_IP); ip->ihl = 5; @@ -203,6 +223,8 @@ static void setup_packet(void *data) char *payload = (char *)(udp + 1); memset(payload, 'A', app_conf.payload_len); + + return 0; } struct sock_args { @@ -276,21 +298,29 @@ int main(int argc, char **argv) if (parse_app_args(argc, argv, &app_conf, shift) < 0) goto out_cfg; + if (cfg->rx_first) { + log_error("ERROR: tx_first should be enabled in txgen"); + goto out_cfg; + } + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; + + log_info("Control Plane setup done..."); + packet_size = sizeof(struct ether_header) + sizeof(struct iphdr) + sizeof(struct udphdr) + app_conf.payload_len; log_debug("Packet size: %zu bytes", packet_size); packet_template = (uint8_t *)calloc(1, packet_size); if (!packet_template) { log_error("ERROR: Memory allocation failed for packet template"); - goto out_cfg; + goto out_cfg_close; } - setup_packet(packet_template); - - if (flash__configure_nf(&nf, cfg) < 0) + if (setup_packet(packet_template) < 0) { + log_error("ERROR: Failed to setup packet template"); goto out_pkt; - - log_info("Control Plane setup done..."); + } signal(SIGINT, int_exit); signal(SIGTERM, int_exit); @@ -350,9 +380,13 @@ int main(int argc, char **argv) exit(EXIT_SUCCESS); out_args: + done = true; free(args); out_pkt: - 
free(packet_template); + free(packet_template); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); out_cfg: free(cfg); exit(EXIT_FAILURE); diff --git a/examples/unit-tests/fwddrop.c b/examples/unit-tests/fwddrop.c index 0c43725..2cb44ba 100644 --- a/examples/unit-tests/fwddrop.c +++ b/examples/unit-tests/fwddrop.c @@ -51,11 +51,12 @@ static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int s app_conf->cpu_end = 0; app_conf->stats_cpu = 1; app_conf->sriov = false; + app_conf->fwd_ratio = 50; argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "hc:e:s:S:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:r:S:")) != -1) switch (c) { case 'h': printf("Usage: %s -h\n", argv[-shift]); @@ -69,6 +70,13 @@ static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int s case 's': app_conf->stats_cpu = atoi(optarg); break; + case 'r': + app_conf->fwd_ratio = atoi(optarg); + if (app_conf->fwd_ratio < 0 || app_conf->fwd_ratio > 100) { + log_error("Invalid forward ratio: %d. 
Must be between 0 and 100.", app_conf->fwd_ratio); + return -1; + } + break; case 'S': if (sscanf(optarg, "%x:%x:%x:%x:%x:%x", ðaddr[0], ðaddr[1], ðaddr[2], ðaddr[3], ðaddr[4], ðaddr[5]) != 6) { @@ -237,7 +245,7 @@ int main(int argc, char **argv) args = calloc(cfg->total_sockets, sizeof(struct sock_args)); if (!args) { log_error("ERROR: Memory allocation failed for sock_args"); - goto out_cfg; + goto out_cfg_close; } for (int i = 0; i < cfg->total_sockets; i++) { @@ -286,7 +294,11 @@ int main(int argc, char **argv) exit(EXIT_SUCCESS); out_args: + done = true; free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); out_cfg: free(cfg); exit(EXIT_FAILURE); diff --git a/examples/unit-tests/fwdrr.c b/examples/unit-tests/fwdrr.c index e05461c..c68e212 100644 --- a/examples/unit-tests/fwdrr.c +++ b/examples/unit-tests/fwdrr.c @@ -218,7 +218,7 @@ int main(int argc, char **argv) args = calloc(cfg->total_sockets, sizeof(struct sock_args)); if (!args) { log_error("ERROR: Memory allocation failed for sock_args"); - goto out_cfg; + goto out_cfg_close; } for (int i = 0; i < cfg->total_sockets; i++) { @@ -270,7 +270,11 @@ int main(int argc, char **argv) exit(EXIT_SUCCESS); out_args: + done = true; free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); out_cfg: free(cfg); exit(EXIT_FAILURE); diff --git a/lib/flash/nf/flash_helpers.c b/lib/flash/nf/flash_helpers.c new file mode 100644 index 0000000..97901ec --- /dev/null +++ b/lib/flash/nf/flash_helpers.c @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025 Debojeet Das + */ + +#include +#include +#include +#include + +#include "flash_nf.h" + +int flash__get_macaddr(struct config *cfg, struct ether_addr *addr) +{ + int fd; + struct ifreq ifr; + + if (!cfg || !addr) { + log_error("Invalid configuration or interface name"); + return -1; + } + + fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (fd < 0) { + log_error("Unable to open socket for ioctl"); + return -1; + } + + 
memset(&ifr, 0, sizeof(ifr)); + log_info("Getting MAC address for interface: %s", cfg->ifname); + + strncpy(ifr.ifr_name, cfg->ifname, IF_NAMESIZE); + ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + + if (ioctl(fd, SIOCGIFHWADDR, &ifr) == -1) { + log_error("Unable to get MAC address"); + close(fd); + return -1; + } + + memcpy(addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN); + + close(fd); + return 0; +} \ No newline at end of file diff --git a/lib/flash/nf/flash_nf.h b/lib/flash/nf/flash_nf.h index da121bb..35d6fc3 100644 --- a/lib/flash/nf/flash_nf.h +++ b/lib/flash/nf/flash_nf.h @@ -29,6 +29,7 @@ struct stats_conf { }; extern bool done; +struct ether_addr; /* Control Path APIs */ @@ -146,6 +147,15 @@ size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec * */ void *flash__stats_thread(void *conf); +/** + * Get the MAC address of the interface specified in the configuration. + * @param cfg: Pointer to the configuration structure. + * @param addr: Pointer to the ether_addr structure to store the MAC address. + * + * @return 0 on success, or -1 on failure. 
+ */ +int flash__get_macaddr(struct config *cfg, struct ether_addr *addr); + /* Advanced APIs */ void flash__populate_fill_ring(struct thread **thread, int frame_size, int total_sockets, int umem_offset, int umem_scale); diff --git a/lib/flash/nf/meson.build b/lib/flash/nf/meson.build index 684af69..cde04af 100644 --- a/lib/flash/nf/meson.build +++ b/lib/flash/nf/meson.build @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 2025 Debojeet Das -sources = files('flash_nf.c', 'flash_stats.c', 'flash_txrx.c', 'flash_txrx_dev.c') +sources = files('flash_nf.c', 'flash_stats.c', 'flash_txrx.c', 'flash_txrx_dev.c', 'flash_helpers.c') headers = files('flash_nf.h') deps += [uds, common, pool] From 491d293ee47c9f5b5da97a1fcc24ad1dd3b2e503 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Mon, 30 Jun 2025 16:51:54 +0530 Subject: [PATCH 12/43] fix: updated arpresolver, firewall, ip4ping, unit-test/correctness --- examples/arpresolver/main.c | 285 +++++++++--------- examples/firewall/main.c | 222 ++++++++------ examples/ip4ping/main.c | 204 +++++++------ examples/unit-tests/correctness.c | 473 ++++++++++++++---------------- 4 files changed, 607 insertions(+), 577 deletions(-) diff --git a/examples/arpresolver/main.c b/examples/arpresolver/main.c index 445fe3e..e6bf2ea 100644 --- a/examples/arpresolver/main.c +++ b/examples/arpresolver/main.c @@ -43,6 +43,16 @@ struct appconf { uint8_t *dest_ether_addr_octet; } app_conf; +// clang-format off +static const char *arpresolver_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-S \tEnable SR-IOV mode and set destination MAC address", + NULL +}; +// clang-format on + static int hex2int(char ch) { if (ch >= '0' && ch <= '9') @@ -66,7 +76,7 @@ static uint8_t *get_mac_addr(char *mac_addr) return dest_ether_addr_octet; } -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static int parse_app_args(int argc, char **argv, 
struct appconf *app_conf, int shift) { int c; opterr = 0; @@ -80,8 +90,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "c:e:s:S:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:S:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -96,8 +109,10 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->sriov = true; break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + return 0; } static void update_dest_mac(void *data) @@ -129,69 +144,12 @@ static void swap_mac_addresses(void *data) *dst_addr = tmp; } -static int get_mac_address(void) -{ - int fd; - struct ifreq ifr; - - // Open a socket - fd = socket(AF_INET, SOCK_DGRAM, 0); - if (fd == -1) { - perror("socket"); - return -1; - } - - // Copy interface name into ifreq structure - strncpy(ifr.ifr_name, cfg->ifname, IF_NAMESIZE); - ifr.ifr_name[IFNAMSIZ - 1] = '\0'; - - // Perform IOCTL to get MAC address - if (ioctl(fd, SIOCGIFHWADDR, &ifr) == -1) { - perror("ioctl"); - close(fd); - return -1; - } - - close(fd); - - // Copy MAC address to src_mac array - memcpy(src_mac, ifr.ifr_hwaddr.sa_data, ETH_ALEN); - - return 0; // Success -} - -static void configure(void) -{ - // Need to change so that we get IPS of all NFS, not just of our local dest - // send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); - // flash__recv_data(cfg->uds_sockfd, &num_valid_ips, sizeof(int)); - // if (num_valid_ips != 1){ - // printf("Arp-resolver should be ran along with ip4ping only"); - // exit(1); - // } - // log_info("Number of Backends: %d", num_valid_ips); - // flash__recv_data(cfg->uds_sockfd, ip4ping_ip, INET_ADDRSTRLEN); - // log_info("ip4ping_ip: %s", ip4ping_ip); - - // configuring src_mac - if (get_mac_address() < 0) { - printf("Error in ioctl: fetch mac address\n"); - 
exit(1); - } -} - static void int_exit(int sig) { log_info("Received Signal: %d", sig); done = true; } -struct Args { - int socket_id; - int *next; - int next_size; -}; - // handling IP4 struct __attribute__((packed)) arp_header { unsigned short arp_hd; @@ -205,88 +163,81 @@ struct __attribute__((packed)) arp_header { unsigned char arp_dpa[4]; }; -static void *worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } - } - } - return NULL; -} +struct sock_args { + int socket_id; +}; static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; + int ret; + nfds_t nfds = 1; + struct socket *xsk; + struct pollfd fds[1] = {}; + struct xskvec *xskvecs, *sendvecs, *dropvecs; + uint32_t i, nrecv, wsend, nsend, wdrop, ndrop; + struct sock_args *a = (struct sock_args *)arg; + int socket_id = a->socket_id; - int *next = a->next; - int next_size = a->next_size; - // free(arg); + log_info("SOCKET_ID: %d", socket_id); - // static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; - struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + xsk = nf->thread[socket_id]->socket; - log_info("2_NEXT_SIZE: %d", next_size); + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("ERROR: Memory allocation failed for xskvecs"); + return NULL; + } - for (int i = 0; i < next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, next[i]); + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!sendvecs) { + log_error("ERROR: Memory allocation failed for sendvecs"); + free(xskvecs); + return NULL; } - msg.msg_iov = 
calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + dropvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!dropvecs) { + log_error("ERROR: Memory allocation failed for dropvecs"); + free(xskvecs); + free(sendvecs); + return NULL; + } - fds[0].fd = nf->thread[socket_id]->socket->fd; + fds[0].fd = xsk->fd; fds[0].events = POLLIN; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; - struct xskvec *drop[nrecv]; - unsigned int tot_pkt_drop = 0; - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); + + wsend = 0; + wdrop = 0; for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; - void *pkt = xv->data; - void *pkt_end = pkt + xv->len; + void *pkt = xskvecs[i].data; + + void *pkt_end = pkt + xskvecs[i].len; uint8_t tmp_mac[ETH_ALEN]; unsigned char buff_ip[4]; struct ethhdr *eth = pkt; if ((void *)(eth + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } struct arp_header *arp = (struct arp_header *)(eth + 1); if ((void *)(arp + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } if (ntohs(eth->h_proto) != ETH_P_ARP || (ntohs(arp->arp_op) != ARPOP_REQUEST)) { - send[tot_pkt_send++] = &msg.msg_iov[i]; + sendvecs[wsend++] = xskvecs[i]; if (app_conf.sriov) { swap_mac_addresses(pkt); update_dest_mac(pkt); @@ -299,12 +250,7 @@ static void *socket_routine(void *arg) char query_ip[IP_STRLEN]; inet_ntop(AF_INET, (struct in_addr *)buff_ip, query_ip, sizeof(query_ip)); - // if (strcmp(ip4ping_ip, query_ip) == 0){ - // goto send_arp_resp; - // } - - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - 
continue; + dropvecs[wdrop++] = xskvecs[i]; // send_arp_resp: memcpy(tmp_mac, eth->h_dest, ETH_ALEN); memcpy(eth->h_dest, eth->h_source, ETH_ALEN); @@ -319,41 +265,64 @@ static void *socket_routine(void *arg) memcpy(arp->arp_dha, arp->arp_sha, ETH_ALEN); memcpy(arp->arp_sha, src_mac, ETH_ALEN); - send[tot_pkt_send++] = &msg.msg_iov[i]; + sendvecs[wsend++] = xskvecs[i]; } if (nrecv) { - size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); - if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + ndrop = flash__dropmsg(cfg, xsk, dropvecs, wdrop); + if (nsend != wsend || ndrop != wdrop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + break; } } if (done) break; } - free(msg.msg_iov); + free(xskvecs); + free(sendvecs); + free(dropvecs); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { log_error("ERROR: Memory allocation failed\n"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "ARP Resolver"; + cfg->app_options = arpresolver_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; log_info("Control Plane Setup Done"); - configure(); + + struct ether_addr tmp_addr; + if (flash__get_macaddr(cfg, &tmp_addr) < 0) { + log_error("ERROR: Unable to get 
MAC address for interface %s", cfg->ifname); + goto out_cfg; + } + + memcpy(src_mac, tmp_addr.ether_addr_octet, ETH_ALEN); + // Parse JSON signal(SIGINT, int_exit); signal(SIGTERM, int_exit); @@ -361,49 +330,65 @@ int main(int argc, char **argv) log_info("STARTING Data Path"); - for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; - - log_info("2_NEXT_SIZE: %d", args->next_size); + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg_close; + } - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + for (int i = 0; i < cfg->total_sockets; i++) { + args[i].socket_id = i; - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } + CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)){ log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } + CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), 
&cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(stats_thread); - flash__wait(cfg); + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + flash__wait(cfg); flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + done = true; + free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } \ No newline at end of file diff --git a/examples/firewall/main.c b/examples/firewall/main.c index c9f6d1f..92c01ee 100644 --- a/examples/firewall/main.c +++ b/examples/firewall/main.c @@ -41,11 +41,14 @@ struct appconf { int stats_cpu; } app_conf; -struct Args { - int socket_id; - int *next; - int next_size; +// clang-format off +static const char *firewall_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + NULL }; +// clang-format on struct session_id { uint32_t saddr; @@ -61,7 +64,7 @@ static void *configure(void) { // Initialise invalid_sessions with random numbers srand(time(NULL)); // Seed only once before generating any random numbers - + for (int i = 0; i < NUM_INVALID_SESSIONS; i++) { int r = rand(); // Different value each iteration invalid_sessions[i] = r; @@ -70,7 +73,7 @@ static void *configure(void) return NULL; } -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; @@ -83,8 +86,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "c:e:s:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = 
atoi(optarg); break; @@ -94,89 +100,86 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int case 's': app_conf->stats_cpu = atoi(optarg); break; - default: - abort(); + default: + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + return 0; } -static void *worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } - } - } - return NULL; -} +struct sock_args { + int socket_id; +}; static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - int socket_id = a->socket_id; - int *next = a->next; - int next_size = a->next_size; - // free(arg); - log_info("SOCKET_ID: %d", socket_id); - // static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; + int ret; + nfds_t nfds = 1; + struct socket *xsk; struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + struct xskvec *xskvecs, *sendvecs, *dropvecs; + uint32_t i, nrecv, wsend, nsend, wdrop, ndrop; + struct sock_args *a = (struct sock_args *)arg; + + int socket_id = a->socket_id; - log_info("2_NEXT_SIZE: %d", next_size); + log_info("SOCKET_ID: %d", socket_id); - for (int i = 0; i < next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, next[i]); + xsk = nf->thread[socket_id]->socket; + + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("ERROR: Memory allocation failed for xskvecs"); + return NULL; } - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!sendvecs) { + log_error("ERROR: Memory allocation failed for sendvecs"); + free(xskvecs); + return NULL; + 
} + + dropvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!dropvecs) { + log_error("ERROR: Memory allocation failed for dropvecs"); + free(xskvecs); + free(sendvecs); + return NULL; + } fds[0].fd = nf->thread[socket_id]->socket->fd; fds[0].events = POLLIN; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); - - struct xskvec *drop[nrecv]; - unsigned int tot_pkt_drop = 0; - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; + + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); + wsend = 0; + wdrop = 0; for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; + struct xskvec *xv = &xskvecs[i]; + void *pkt = xv->data; void *pkt_end = pkt + xv->len; + struct ethhdr *eth = pkt; if ((void *)(eth + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } if (eth->h_proto != htons(ETH_P_IP)) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } struct iphdr *iph = (void *)(eth + 1); if ((void *)(iph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } @@ -188,7 +191,7 @@ static void *socket_routine(void *arg) case IPPROTO_TCP:; struct tcphdr *tcph = next; if ((void *)(tcph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } @@ -200,7 +203,7 @@ static void *socket_routine(void *arg) case IPPROTO_UDP:; struct udphdr *udph = next; if ((void *)(udph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } @@ -210,7 +213,7 @@ static void *socket_routine(void *arg) break; default: - drop[tot_pkt_drop++] = 
&msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } @@ -226,44 +229,59 @@ static void *socket_routine(void *arg) bool invalid = false; for (int i = 0; i < NUM_INVALID_SESSIONS; i++) { if (invalid_sessions[i] == sid_hash) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; invalid = true; break; } } if (!invalid) - send[tot_pkt_send++] = &msg.msg_iov[i]; + sendvecs[wsend++] = xskvecs[i]; } if (nrecv) { - size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); - if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + ndrop = flash__dropmsg(cfg, xsk, dropvecs, wdrop); + if (nsend != wsend || ndrop != wdrop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + break; } } if (done) break; } - free(msg.msg_iov); + free(xskvecs); + free(sendvecs); + free(dropvecs); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { log_error("ERROR: Memory allocation failed\n"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "Firewall Application"; + cfg->app_options = firewall_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; log_info("Control Plane Setup Done"); @@ -274,49 +292,63 @@ int main(int argc, char **argv) 
log_info("STARTING Data Path"); + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg_close; + } for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; - - log_info("2_NEXT_SIZE: %d", args->next_size); - - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + args[i].socket_id = i; - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } + CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s\n", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(stats_thread); - flash__wait(cfg); + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s\n", 
strerror(errno)); + goto out_args; + } + flash__wait(cfg); flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + done = true; + free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } \ No newline at end of file diff --git a/examples/ip4ping/main.c b/examples/ip4ping/main.c index 413ed86..f4a6c04 100644 --- a/examples/ip4ping/main.c +++ b/examples/ip4ping/main.c @@ -35,6 +35,16 @@ struct appconf { uint8_t *dest_ether_addr_octet; } app_conf; +// clang-format off +static const char *ip4ping_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-S \tEnable SR-IOV mode and set dest MAC address", + NULL +}; +// clang-format on + static int hex2int(char ch) { if (ch >= '0' && ch <= '9') @@ -58,7 +68,7 @@ static uint8_t *get_mac_addr(char *mac_addr) return dest_ether_addr_octet; } -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; @@ -72,8 +82,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "c:e:s:S:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:S:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -88,8 +101,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->sriov = true; break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + + return 0; } static void update_dest_mac(void *data) @@ -139,44 +155,61 @@ static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } -struct Args { +struct sock_args { int socket_id; - int *next; - int next_size; }; 
unsigned int count = 1; static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; + int ret; + nfds_t nfds = 1; + struct socket *xsk; + struct xskvec *xskvecs, *sendvecs, *dropvecs; + struct pollfd fds[1] = {}; + uint32_t i, nrecv, nsend, wsend, wdrop, ndrop; + struct sock_args *a = (struct sock_args *)arg; + int socket_id = a->socket_id; log_info("SOCKET_ID: %d", socket_id); - int i, ret, nfds = 1, nrecv; - struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + xsk = nf->thread[socket_id]->socket; + + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("Failed to allocate xskvecs array"); + return NULL; + } + + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!sendvecs) { + log_error("Failed to allocate sendvecs array"); + free(xskvecs); + return NULL; + } - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + dropvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!dropvecs) { + log_error("Failed to allocate dropvecs array"); + free(xskvecs); + free(sendvecs); + return NULL; + } - fds[0].fd = nf->thread[socket_id]->socket->fd; + fds[0].fd = xsk->fd; fds[0].events = POLLIN; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); - - unsigned int tot_pkt_drop = 0; - unsigned int tot_pkt_send = 0; - struct xskvec *drop[nrecv]; - struct xskvec *send[nrecv]; + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); + wsend = 0; + wdrop = 0; for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; + struct xskvec *xv = &xskvecs[i]; void *data = xv->data; uint32_t len = xv->len; @@ -185,22 +218,22 @@ static void *socket_routine(void *arg) struct in_addr 
tmp_ip; struct ethhdr *eth = (struct ethhdr *)data; if ((void *)(eth + 1) > data_end) - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; if (ntohs(eth->h_proto) != ETH_P_IP) - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; struct iphdr *ip = (struct iphdr *)(eth + 1); if ((void *)(ip + 1) > data_end) - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; struct icmphdr *icmp = (struct icmphdr *)(ip + 1); if ((void *)(icmp + 1) > data_end) - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; if (ntohs(eth->h_proto) != ETH_P_IP || len < (sizeof(*eth) + sizeof(*ip) + sizeof(*icmp)) || - ip->protocol != IPPROTO_ICMP || icmp->type != ICMP_ECHO) { - send[tot_pkt_send++] = &msg.msg_iov[i]; + ip->protocol != IPPROTO_ICMP || icmp->type != ICMP_ECHO) { + sendvecs[wsend++] = xskvecs[i]; if (app_conf.sriov) { swap_mac_addresses(data); @@ -224,9 +257,10 @@ static void *socket_routine(void *arg) } if (nrecv) { - size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); - if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + ndrop = flash__dropmsg(cfg, xsk, dropvecs, wdrop); + + if (nsend != wsend || ndrop != wdrop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); } @@ -235,46 +269,38 @@ static void *socket_routine(void *arg) if (done) break; } - free(msg.msg_iov); - return NULL; -} - -static void *worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < 
cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } - } - } + free(xskvecs); + free(sendvecs); + free(dropvecs); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { log_error("ERROR: Memory allocation failed\n"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "IP4 Ping Application"; + cfg->app_options = ip4ping_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; log_info("Control Plane Setup Done"); @@ -284,49 +310,61 @@ int main(int argc, char **argv) log_info("STARTING Data Path"); + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg_close; + } for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; - - log_info("2_NEXT_SIZE: %d", args->next_size); - - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + args[i].socket_id = i; - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } + CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + 
app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s\n", strerror(errno)); + goto out_args; + } } + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(stats_thread); + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s\n", strerror(errno)); + goto out_args; + } flash__wait(cfg); - flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + done = true; + free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } diff --git a/examples/unit-tests/correctness.c b/examples/unit-tests/correctness.c index 08727d8..3fe8e38 100644 --- a/examples/unit-tests/correctness.c +++ b/examples/unit-tests/correctness.c @@ -23,9 +23,8 @@ bool done = false; struct config *cfg = NULL; -struct nf *nf; +struct nf *nf = NULL; struct test_stats *stats_arr; -// bool *bool_array; static void int_exit(int sig) { @@ -34,21 +33,27 @@ static void int_exit(int sig) } struct testHeader { - __u8 lastHop; - __u8 hopCount; - __u64 pktId; - __u16 old_dst; + uint8_t lastHop; + uint8_t hopCount; + uint64_t pktId; + uint16_t old_dst; + int sender_nf_id; + int sender_next_size; }; +#define 
MAX_NFS 16 +struct nf_info { + int sender_next_size; + bool first_packet_received; + uint64_t expected_mod_value; + uint64_t next_expected_pkt_id; +} nf_info_arr[MAX_NFS] = {0}; + struct test_stats { - __u64 pkt_count; - __u64 even_next; // Next expected even packet ID - __u64 odd_next; // Next expected odd packet ID - __u64 pkt_dropped; - __u64 pkt_corrupted; - __u64 pkt_correct; - __u64 even; - __u64 odd; + uint64_t pkt_count; + uint64_t pkt_dropped; + uint64_t pkt_corrupted; + uint64_t pkt_correct; }; struct appconf { @@ -58,7 +63,15 @@ struct appconf { int hops; } app_conf; -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static const char *correctness_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-h \tNumber of hops (default: 1)", + NULL +}; + +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; @@ -67,6 +80,7 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->cpu_start = 0; app_conf->cpu_end = 0; app_conf->stats_cpu = 1; + app_conf->hops = 1; argc -= shift; argv += shift; @@ -86,27 +100,10 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->hops = atoi(optarg); break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } -} - -static void __hex_dump(void *pkt, size_t length) -{ - const unsigned char *address = (unsigned char *)pkt; - size_t line_size = 32; - int i = 0; - - while (length-- > 0) { - printf("%02X ", *address++); - if (!(++i % line_size) || (length == 0 && i % line_size)) { - if (length == 0) { - while (i++ % line_size) - printf("__ "); - } - printf("\n"); - } - } - printf("\n"); + return 0; } static void process_packets(void *data, __u32 *len, struct test_stats *stats) @@ -114,12 +111,6 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) void *pos = 
data; void *data_end = data + *len; - // if (*before1_1 < 2) { - // printf("before: %lld, len %d\n", stats->pkt_count, *len); - // __hex_dump(data, *len); - // *before1_1 = *before1_1 + 1; - // } - struct ethhdr *eth = (struct ethhdr *)pos; if ((void *)(eth + 1) > data_end) { stats->pkt_dropped++; @@ -173,241 +164,198 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) payload_len = ntohs(udphdr->len) - sizeof(struct udphdr); size_t testHeaderLen = sizeof(struct testHeader); + void *payload_end = pos + payload_len; + + struct testHeader *testHeader = NULL; /* First NF */ if (ntohs(udphdr->dest) != TEST_PORT) { - // Shift the data to add the test header. Can we do this without memmove?? - memmove(pos + testHeaderLen, pos, payload_len); - - // Add test header and update the old length - struct testHeader *testHeader = pos; - testHeader->lastHop = app_conf.hops; - testHeader->hopCount = 1; - testHeader->pktId = stats->pkt_count++; - *len += testHeaderLen; - - // update the udp header - testHeader->old_dst = udphdr->dest; - udphdr->dest = htons(TEST_PORT); - udphdr->len = htons(ntohs(udphdr->len) + testHeaderLen); - - // update the ip payload length - iph->tot_len = htons(ntohs(iph->tot_len) + testHeaderLen); + // Append test header at the end of the UDP payload + testHeader = (struct testHeader *)payload_end; + testHeader->lastHop = app_conf.hops; + testHeader->hopCount = 1; + testHeader->old_dst = udphdr->dest; + + *len += testHeaderLen; + udphdr->len = htons(ntohs(udphdr->len) + testHeaderLen); + iph->tot_len = htons(ntohs(iph->tot_len) + testHeaderLen); + + udphdr->dest = htons(TEST_PORT); + + stats->pkt_correct++; } else { - struct testHeader *testHeader = pos; + // testHeader is at the end of the UDP payload + testHeader = (struct testHeader *)(payload_end - testHeaderLen); testHeader->hopCount++; - // Verify if the pktId is equal to the pkt_count++ and update the pkt_count - // if (testHeader->pktId != stats->pkt_count) { - // if 
(testHeader->pktId < stats->pkt_count) { - // stats->pkt_corrupted++; - // stats->pkt_count = testHeader->pktId + 1; - // } else { - // stats->pkt_corrupted++; - // stats->pkt_count = testHeader->pktId + 1; - // } - // } else { - // stats->pkt_count = testHeader->pktId + 1; - // stats->pkt_correct++; - // } - - // if (bool_array[testHeader->pktId]) { - // stats->pkt_corrupted++; - // } else { - // bool_array[testHeader->pktId] = true; - // stats->pkt_correct++; - // if (testHeader->pktId % 2 == 0) { - // stats->even++; - // } else { - // stats->odd++; - // } - // } - - if (testHeader->pktId % 2 == 0) { // Even packet - if (testHeader->pktId != stats->even_next) { - __hex_dump(data, *len); - if (testHeader->pktId < stats->even_next) { - stats->pkt_corrupted++; - stats->even_next = testHeader->pktId + 2; - } else { - stats->pkt_corrupted++; - stats->even_next = testHeader->pktId + 2; - } - } else { - stats->even++; - stats->pkt_correct++; - stats->even_next += 2; + uint64_t received_pktId = testHeader->pktId; + int sender_nf_id = testHeader->sender_nf_id; + int sender_next_size = testHeader->sender_next_size; + + if (sender_nf_id < 0 || sender_nf_id >= MAX_NFS) { + log_error("ERROR: Invalid sender NF ID %d", sender_nf_id); + stats->pkt_corrupted++; + goto test_header_update; + } + if (sender_next_size <= 0) { + log_error("ERROR: Invalid sender next size %d", sender_next_size); + stats->pkt_corrupted++; + goto test_header_update; + } + + struct nf_info *sender_info = &nf_info_arr[sender_nf_id]; + + if (!sender_info->first_packet_received) { + sender_info->first_packet_received = true; + sender_info->sender_next_size = sender_next_size; + sender_info->expected_mod_value = received_pktId % sender_next_size; + sender_info->next_expected_pkt_id = received_pktId + sender_next_size; + stats->pkt_correct++; // first packet is always correct + } else { + if (sender_next_size != sender_info->sender_next_size) { + log_error("ERROR: nf_next_size mismatch for NF ID %d: expected 
%d, got %d", + sender_nf_id, + sender_info->sender_next_size, + sender_next_size); + stats->pkt_corrupted++; + goto test_header_update; + } + + if (received_pktId % sender_next_size != sender_info->expected_mod_value) { + log_error("ERROR: pktId %% sender_next_size mismatch for NF ID %d: expected %lu, got %lu", + sender_nf_id, + sender_info->expected_mod_value, + received_pktId % sender_next_size); + stats->pkt_corrupted++; + goto test_header_update; } - } else { // Odd packet - if (testHeader->pktId != stats->odd_next) { - if (testHeader->pktId < stats->odd_next) { - __hex_dump(data, *len); + + uint64_t next_expected_pkt_id = sender_info->next_expected_pkt_id; + + if(received_pktId != next_expected_pkt_id) { + if (received_pktId < next_expected_pkt_id) { stats->pkt_corrupted++; - stats->odd_next = testHeader->pktId + 2; } else { - stats->pkt_corrupted++; - stats->odd_next = testHeader->pktId + 2; + sender_info->next_expected_pkt_id = received_pktId + sender_next_size; + stats->pkt_dropped += (received_pktId - next_expected_pkt_id) / sender_next_size; } } else { - stats->odd++; + sender_info->next_expected_pkt_id += sender_next_size; stats->pkt_correct++; - stats->odd_next += 2; } } - if (testHeader->lastHop == testHeader->hopCount) { - uint8_t tmp_mac[ETH_ALEN]; - struct in_addr tmp_ip; - unsigned short tmp_port; - payload_len -= testHeaderLen; - - tmp_port = testHeader->old_dst; - - // Shift the data to remove the test header - memmove(pos, pos + testHeaderLen, payload_len); - - // update the udp header - udphdr->dest = tmp_port; - udphdr->len = htons(ntohs(udphdr->len) - testHeaderLen); - *len -= testHeaderLen; - - tmp_port = udphdr->dest; - udphdr->dest = udphdr->source; - udphdr->source = tmp_port; - - // update the ip payload length - iph->tot_len = htons(ntohs(iph->tot_len) - testHeaderLen); + } - memcpy(tmp_mac, eth->h_dest, ETH_ALEN); - memcpy(eth->h_dest, eth->h_source, ETH_ALEN); - memcpy(eth->h_source, tmp_mac, ETH_ALEN); - memcpy(&tmp_ip, &iph->saddr, 
sizeof(tmp_ip)); - memcpy(&iph->saddr, &iph->daddr, sizeof(tmp_ip)); - memcpy(&iph->daddr, &tmp_ip, sizeof(tmp_ip)); - } +test_header_update: + testHeader->pktId = stats->pkt_count++; + testHeader->sender_nf_id = cfg->nf_id; + testHeader->sender_next_size = nf->next_size; + + if (testHeader->lastHop == testHeader->hopCount) { + uint8_t tmp_mac[ETH_ALEN]; + struct in_addr tmp_ip; + unsigned short tmp_port; + payload_len -= testHeaderLen; + + tmp_port = testHeader->old_dst; + + udphdr->dest = tmp_port; + udphdr->len = htons(ntohs(udphdr->len) - testHeaderLen); + *len -= testHeaderLen; + + tmp_port = udphdr->dest; + udphdr->dest = udphdr->source; + udphdr->source = tmp_port; + + iph->tot_len = htons(ntohs(iph->tot_len) - testHeaderLen); + + memcpy(tmp_mac, eth->h_dest, ETH_ALEN); + memcpy(eth->h_dest, eth->h_source, ETH_ALEN); + memcpy(eth->h_source, tmp_mac, ETH_ALEN); + + memcpy(&tmp_ip, &iph->saddr, sizeof(tmp_ip)); + memcpy(&iph->saddr, &iph->daddr, sizeof(tmp_ip)); + memcpy(&iph->daddr, &tmp_ip, sizeof(tmp_ip)); } - // if (*after1_1 < 2) { - // printf("after:\n"); - // __hex_dump(data, *len); - // *after1_1 = *after1_1 + 1; - // } - return; } -struct Args { +struct sock_args { int socket_id; - int *next; int next_size; }; static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - int socket_id = a->socket_id; - int *next = a->next; - int next_size = a->next_size; - // free(arg); - log_info("SOCKET_ID: %d", socket_id); - static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; + nfds_t nfds = 1; + int ret, next_size; + struct socket *xsk; + struct xskvec *xskvecs; struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + uint32_t i, nrecv, nsend, count, nb_frags = 0; + struct sock_args *a = (struct sock_args *)arg; - // int idle_timeout = 1; - // uint64_t idle_timestamp = 0; + next_size = a->next_size; - log_info("2_NEXT_SIZE: %d", next_size); + log_debug("SOCKET_ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; - for (int i = 
0; i < next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, next[i]); + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("ERROR: Memory allocation failed for xskvecs"); + return NULL; } - cfg->xsk->poll_timeout = -1; - - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); - - fds[0].fd = nf->thread[socket_id]->socket->fd; + fds[0].fd = xsk->fd; fds[0].events = POLLIN; - nf->thread[socket_id]->socket->idle_fd.fd = nf->thread[socket_id]->socket->fd; - nf->thread[socket_id]->socket->idle_fd.events = POLLIN; - - unsigned int count = 0; + count = 0; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret != 1) - continue; - } + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; - // ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - // if (ret <= 0 || ret > 1) - // continue; + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); - - // if (nrecv == 0) { - // uint64_t tstamp = rdtsc(); - - // if (idle_timeout && idle_timestamp == 0) { - // idle_timestamp = tstamp + ((get_timer_hz(cfg) / MS_PER_S) * idle_timeout); - // continue; - // } - - // if (idle_timestamp && (tstamp > idle_timestamp)) { - // idle_timestamp = 0; - - // ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - // if (ret) - // nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); - // else - // continue; - // } - // } else - // idle_timestamp = 0; - - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; - bool eop = IS_EOP_DESC(xv->options); - if (next_size != 0) { - xv->options = ((count % next_size) << 16) | (xv->options & 0xFFFF); + xskvecs[i].options = ((count % next_size) 
<< 16) | (xskvecs[i].options & 0xFFFF); count++; } - char *pkt = xv->data; + + char *pkt = xskvecs[i].data; if (!nb_frags++) - process_packets(pkt, &xv->len, &stats_arr[socket_id]); + process_packets(pkt, &xskvecs[i].len, &stats_arr[a->socket_id]); - send[tot_pkt_send++] = &msg.msg_iov[i]; - if (eop) + if (IS_EOP_DESC(xskvecs[i].options)) nb_frags = 0; } if (nrecv) { - ret = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - if (ret != nrecv) { + nsend = flash__sendmsg(cfg, xsk, xskvecs, nrecv); + if (nsend != nrecv) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + break; } } if (done) break; } - free(msg.msg_iov); + free(xskvecs); return NULL; } -static void *worker__stats(void *arg) +static void *worker__stats(void *conf) { - (void)arg; + struct stats_conf *arg = (struct stats_conf *)conf; + struct nf *nf = arg->nf; + struct config *cfg = arg->cfg; if (cfg->verbose) { unsigned int interval = cfg->stats_interval; @@ -422,11 +370,9 @@ static void *worker__stats(void *arg) log_error("Terminal clear error"); for (int i = 0; i < cfg->total_sockets; i++) { flash__dump_stats(cfg, nf->thread[i]->socket); - printf("%-18s %'-14llu\n", "dropped", stats_arr[i].pkt_dropped); - printf("%-18s %'-14llu\n", "corrupt", stats_arr[i].pkt_corrupted); - printf("%-18s %'-14llu\n", "correct", stats_arr[i].pkt_correct); - printf("%-18s %'-14llu\n", "even", stats_arr[i].even); - printf("%-18s %'-14llu\n", "odd", stats_arr[i].odd); + printf("%-18s %'-14lu\n", "dropped", stats_arr[i].pkt_dropped); + printf("%-18s %'-14lu\n", "corrupt", stats_arr[i].pkt_corrupted); + printf("%-18s %'-14lu\n", "correct", stats_arr[i].pkt_correct); } } } @@ -435,26 +381,37 @@ static void *worker__stats(void *arg) int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { 
log_error("ERROR: Memory allocation failed\n"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "Correctness Test Application"; + cfg->app_options = correctness_options; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; stats_arr = calloc(cfg->total_sockets, sizeof(struct test_stats)); - stats_arr->even_next = 0; - stats_arr->odd_next = 1; - // bool_array = calloc(UINT32_MAX, sizeof(bool)); - // if (!bool_array) { - // fprintf(stderr, "ERROR: Unable to allocate memory for boolean array\n"); - // exit(EXIT_FAILURE); - // } + if (!stats_arr) { + log_error("ERROR: Memory allocation failed for stats_arr"); + goto out_cfg; + } log_info("Control Plane Setup Done"); @@ -464,49 +421,67 @@ int main(int argc, char **argv) log_info("STARTING Data Path"); - for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg_close; + } - log_info("2_NEXT_SIZE: %d", args->next_size); + for (int i = 0; i < cfg->total_sockets; i++) { + args[i].socket_id = i; + args[i].next_size = nf->next_size; - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + log_info("2_NEXT_SIZE: %d", args[i].next_size); - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { 
log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, worker__stats, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; + } + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; } - pthread_detach(stats_thread); flash__wait(cfg); flash__xsk_close(cfg, nf); return EXIT_SUCCESS; + +out_args: + done = true; + free(args); +out_cfg_close: + free(stats_arr); + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } From 493ede6504238d057f2472576b0612ec2b753c4b Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Mon, 30 Jun 2025 22:06:09 +0530 Subject: [PATCH 13/43] feat: optimised logging library - debug is compiled out in release builds - color logs can be configured using meson_options - all required dependencies are kept private --- examples/helloworld/main.c | 2 ++ examples/l2fwd/main.c | 3 ++- examples/simplefwd/main.c | 1 + examples/txgen/main.c | 3 ++- examples/unit-tests/fwddrop.c | 1 + 
examples/unit-tests/fwdrr.c | 1 + lib/flash/log/log.c | 37 ++++++++++++++++++++++++++++------- lib/flash/log/log.h | 36 ++++++++++++---------------------- lib/flash/log/meson.build | 16 ++++++++++++++- lib/flash/nf/flash_stats.c | 4 ++++ meson_options.txt | 5 +++++ 11 files changed, 75 insertions(+), 34 deletions(-) create mode 100644 meson_options.txt diff --git a/examples/helloworld/main.c b/examples/helloworld/main.c index c666df9..164e748 100644 --- a/examples/helloworld/main.c +++ b/examples/helloworld/main.c @@ -8,6 +8,8 @@ #include #include +#include + struct config *cfg = NULL; struct nf *nf = NULL; diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c index e13159f..f3820d4 100644 --- a/examples/l2fwd/main.c +++ b/examples/l2fwd/main.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -18,7 +19,7 @@ struct nf *nf = NULL; static void int_exit(int sig) { - log_debug("Received Signal: %d", sig); + log_info("Received Signal: %d", sig); done = true; } diff --git a/examples/simplefwd/main.c b/examples/simplefwd/main.c index c68b35e..c8abc8b 100644 --- a/examples/simplefwd/main.c +++ b/examples/simplefwd/main.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include diff --git a/examples/txgen/main.c b/examples/txgen/main.c index b029a72..3e752ee 100644 --- a/examples/txgen/main.c +++ b/examples/txgen/main.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -24,7 +25,7 @@ uint8_t *packet_template = NULL; static void int_exit(int sig) { - log_debug("Received Signal: %d", sig); + log_info("Received Signal: %d", sig); done = true; } diff --git a/examples/unit-tests/fwddrop.c b/examples/unit-tests/fwddrop.c index 2cb44ba..5713e84 100644 --- a/examples/unit-tests/fwddrop.c +++ b/examples/unit-tests/fwddrop.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/examples/unit-tests/fwdrr.c b/examples/unit-tests/fwdrr.c index c68e212..f31c903 100644 --- 
a/examples/unit-tests/fwdrr.c +++ b/examples/unit-tests/fwdrr.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/lib/flash/log/log.c b/lib/flash/log/log.c index 59d6ec7..10df2fc 100644 --- a/lib/flash/log/log.c +++ b/lib/flash/log/log.c @@ -19,13 +19,30 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * - * Taken from https://github.com/rxi/log.c + * Taken from https://github.com/rxi/log.c and modified for flash */ #include "log.h" +#include +#include +#include +#include +#include +#include + #define MAX_CALLBACKS 32 -#define LOG_USE_COLOR + +struct log_Event { + va_list ap; + const char *fmt; + const char *file; + const char *caller; + struct tm *time; + void *udata; + int line; + int level; +}; typedef struct { log_LogFn fn; @@ -39,7 +56,13 @@ static struct { int level; bool quiet; Callback callbacks[MAX_CALLBACKS]; -} L; +} L = { .level = +#ifdef LOG_ENABLE_DEBUG + LOG_TRACE, /* Debug builds: start with trace level */ +#else + LOG_INFO, /* Release builds: start with info level */ +#endif + .quiet = false }; static const char *level_strings[] = { "TRACE", "DEBUG", "INFO", "WARN", "ERROR", "FATAL" }; @@ -102,9 +125,9 @@ void log_set_level(int level) L.level = level; } -void log_set_quiet(bool enable) +void log_set_quiet(int enable) { - L.quiet = enable; + L.quiet = enable != false; } int log_add_callback(log_LogFn fn, void *udata, int level) @@ -118,9 +141,9 @@ int log_add_callback(log_LogFn fn, void *udata, int level) return -1; } -int log_add_fp(FILE *fp, int level) +int log_add_fp(void *fp, int level) { - return log_add_callback(file_callback, fp, level); + return log_add_callback(file_callback, (FILE *)fp, level); /* Cast void* back to FILE* */ } static void init_event(log_Event *ev, void *udata) diff --git a/lib/flash/log/log.h b/lib/flash/log/log.h index acd4bf0..f5efdc2 100644 --- a/lib/flash/log/log.h +++ b/lib/flash/log/log.h @@ -4,50 +4,38 @@ * This library is free 
software; you can redistribute it and/or modify it * under the terms of the MIT license. See `log.c` for details. * - * Taken from https://github.com/rxi/log.c + * Taken from https://github.com/rxi/log.c and modified for flash */ #ifndef LOG_H #define LOG_H -#include -#include -#include -#include -#include -#include - -#define LOG_VERSION "0.1.0" - -typedef struct { - va_list ap; - const char *fmt; - const char *file; - const char *caller; - struct tm *time; - void *udata; - int line; - int level; -} log_Event; - +typedef struct log_Event log_Event; typedef void (*log_LogFn)(log_Event *ev); -typedef void (*log_LockFn)(bool lock, void *udata); +typedef void (*log_LockFn)(int lock, void *udata); enum { LOG_TRACE, LOG_DEBUG, LOG_INFO, LOG_WARN, LOG_ERROR, LOG_FATAL }; +// clang-format off +#ifdef LOG_ENABLE_DEBUG #define log_trace(...) log_log(LOG_TRACE, __FILE__, __LINE__, __func__, __VA_ARGS__) #define log_debug(...) log_log(LOG_DEBUG, __FILE__, __LINE__, __func__, __VA_ARGS__) +#else +#define log_trace(...) do { } while (0) +#define log_debug(...) do { } while (0) +#endif #define log_info(...) log_log(LOG_INFO, __FILE__, __LINE__, __func__, __VA_ARGS__) #define log_warn(...) log_log(LOG_WARN, __FILE__, __LINE__, __func__, __VA_ARGS__) #define log_error(...) log_log(LOG_ERROR, __FILE__, __LINE__, __func__, __VA_ARGS__) #define log_fatal(...) 
log_log(LOG_FATAL, __FILE__, __LINE__, __func__, __VA_ARGS__) +// clang-format on const char *log_level_string(int level); void log_set_lock(log_LockFn fn, void *udata); void log_set_level(int level); -void log_set_quiet(bool enable); +void log_set_quiet(int enable); int log_add_callback(log_LogFn fn, void *udata, int level); -int log_add_fp(FILE *fp, int level); +int log_add_fp(void *fp, int level); void log_set_level_from_env(void); void log_log(int level, const char *file, int line, const char *caller, const char *fmt, ...); diff --git a/lib/flash/log/meson.build b/lib/flash/log/meson.build index 0cc4d73..11e5b01 100644 --- a/lib/flash/log/meson.build +++ b/lib/flash/log/meson.build @@ -4,7 +4,21 @@ sources = files('log.c') headers = files('log.h') -liblog = library(libname, sources, install: true) +log_c_args = [] +if get_option('log_use_color') + log_c_args += ['-DLOG_USE_COLOR'] +endif + +if get_option('buildtype') == 'debug' or get_option('buildtype') == 'debugoptimized' + log_c_args += ['-DLOG_ENABLE_DEBUG'] + message('Log library: Debug/trace logging enabled for build type: ' + get_option('buildtype')) +else + message('Log library: Debug/trace logging disabled for build type: ' + get_option('buildtype')) +endif + +liblog = library(libname, sources, + install: true, + c_args: log_c_args) log = declare_dependency(link_with: liblog, include_directories: include_directories('.')) flash_libs += log \ No newline at end of file diff --git a/lib/flash/nf/flash_stats.c b/lib/flash/nf/flash_stats.c index 8ead395..c7acfca 100644 --- a/lib/flash/nf/flash_stats.c +++ b/lib/flash/nf/flash_stats.c @@ -6,6 +6,10 @@ #include #include #include +#include +#include +#include +#include #include "flash_nf.h" diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000..b9610e5 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2025 Debojeet Das + +option('log_use_color', type: 'boolean', value: 
true, + description: 'Enable colored output in the log library') From cadc7ddb00ac8d99a0467680eb07d16a7efcece0 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Mon, 30 Jun 2025 22:58:33 +0530 Subject: [PATCH 14/43] feat: rust build support using meson - the commit tries to fix issue #1 - default is set to false --- examples/meson.build | 48 +++++++++++++++++++++++++++++++++++++++++++- meson.build | 21 ++++++++++++++++++- meson_options.txt | 3 +++ 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/examples/meson.build b/examples/meson.build index c6afb34..d9cd80e 100644 --- a/examples/meson.build +++ b/examples/meson.build @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 2025 Debojeet Das +# C examples dirs = [ 'unit-tests', 'helloworld', @@ -15,6 +16,19 @@ dirs = [ 'txgen' ] +# Rust examples (only if Rust is enabled) +rust_dirs = [] +if get_option('enable_rust') + rust_dirs = [ + 'helloworld-rs', + 'simplefwd-rs', + 'l2fwd-rs', + 'ip4ping-rs', + 'maglev-rs', + 'firewall-rs' + ] +endif + def_deps = [include, log, nf, params, uds] foreach d : dirs @@ -27,4 +41,36 @@ foreach d : dirs subdir(d) install_headers(headers, subdir: meson.project_name().to_lower()) -endforeach \ No newline at end of file +endforeach + +if get_option('enable_rust') and cargo.found() + message('>>> Configuring Rust examples') + + cargo_build_args = ['build', '--target-dir', rust_target_dir] + + if get_option('buildtype') == 'release' + cargo_build_args += ['--release'] + message('Rust: Building in release mode') + else + message('Rust: Building in debug mode') + endif + + if get_option('buildtype') == 'debug' or get_option('buildtype') == 'debugoptimized' + cargo_build_args += ['-F', 'tracing'] + endif + + rust_build = custom_target( + 'rust_workspace', + output: 'rust_build_complete', + command: [cargo] + cargo_build_args, + console: true, + build_by_default: true, + build_always_stale: true, + ) + + foreach rust_dir : rust_dirs + message('Rust example: ' + 
rust_dir + ' will be built with the workspace') + endforeach + + message('<<< Rust examples configured') +endif \ No newline at end of file diff --git a/meson.build b/meson.build index 35f38ba..5e35e65 100644 --- a/meson.build +++ b/meson.build @@ -19,6 +19,19 @@ use_static_libs = get_option('default_library') == 'static' cc = meson.get_compiler('c') +enable_rust = get_option('enable_rust') +cargo = find_program('cargo', required: enable_rust) +rust_build = [] +rust_target_dir = '' +rust_profile = '' + +if enable_rust and cargo.found() + message('Rust support enabled') + rust_target_dir = meson.current_build_dir() / 'rust-target' +else + message('Rust support disabled') +endif + build_dir = meson.current_build_dir() flash_libs = [] enabled_libs = [] @@ -171,4 +184,10 @@ pkg.generate( description: '''FLASH userspace library for AF_XDP userspace applications''', install_dir: 'lib/pkgconfig', ) -message('<<< Done pkg-config file') \ No newline at end of file +message('<<< Done pkg-config file') + +# Rust builds are now handled in examples/meson.build +if enable_rust and cargo.found() + message('>>> Rust builds will be handled by examples/meson.build') + message('<<< Rust builds configured') +endif \ No newline at end of file diff --git a/meson_options.txt b/meson_options.txt index b9610e5..6fe49f6 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -3,3 +3,6 @@ option('log_use_color', type: 'boolean', value: true, description: 'Enable colored output in the log library') + +option('enable_rust', type: 'boolean', value: false, + description: 'Enable building Rust applications and libraries') From c950f0487590246ad8ad37efceb224044e21ed53 Mon Sep 17 00:00:00 2001 From: Arghyadip Chakraborty Date: Thu, 10 Jul 2025 13:08:39 +0800 Subject: [PATCH 15/43] refactor(rust): modular, added pooling, fixed types - refactored modules - added pool feature - updated data types - removed poll timeout --- lib/flash-rs/Cargo.toml | 2 + lib/flash-rs/src/client.rs | 73 +++--- 
lib/flash-rs/src/config/common.rs | 4 +- lib/flash-rs/src/config/config_clap.rs | 4 +- lib/flash-rs/src/config/config_noclap.rs | 4 +- lib/flash-rs/src/error.rs | 3 +- lib/flash-rs/src/fd/error.rs | 19 ++ lib/flash-rs/src/{xsk => fd}/fd.rs | 83 ++++--- lib/flash-rs/src/fd/mod.rs | 7 + lib/flash-rs/src/{xsk => fd}/xdp.rs | 13 +- lib/flash-rs/src/lib.rs | 6 +- lib/flash-rs/src/mem/desc.rs | 57 +++++ lib/flash-rs/src/mem/error.rs | 5 +- lib/flash-rs/src/mem/mod.rs | 12 + lib/flash-rs/src/mem/pool.rs | 39 +++ lib/flash-rs/src/mem/ring/comp.rs | 64 +++++ lib/flash-rs/src/mem/ring/error.rs | 4 + lib/flash-rs/src/mem/ring/fill.rs | 95 +++++++ lib/flash-rs/src/mem/ring/mod.rs | 21 ++ lib/flash-rs/src/mem/ring/rx.rs | 64 +++++ lib/flash-rs/src/mem/ring/tx.rs | 69 ++++++ lib/flash-rs/src/mem/umem.rs | 19 +- lib/flash-rs/src/{xsk => }/stats.rs | 15 +- lib/flash-rs/src/uds/client.rs | 54 ++-- lib/flash-rs/src/uds/conn.rs | 8 + lib/flash-rs/src/uds/error.rs | 12 +- lib/flash-rs/src/xsk/desc.rs | 19 -- lib/flash-rs/src/xsk/error.rs | 27 +- lib/flash-rs/src/xsk/mod.rs | 11 - lib/flash-rs/src/xsk/ring.rs | 249 ------------------- lib/flash-rs/src/xsk/socket.rs | 302 ++++++++++++++--------- 31 files changed, 804 insertions(+), 560 deletions(-) create mode 100644 lib/flash-rs/src/fd/error.rs rename lib/flash-rs/src/{xsk => fd}/fd.rs (51%) create mode 100644 lib/flash-rs/src/fd/mod.rs rename lib/flash-rs/src/{xsk => fd}/xdp.rs (81%) create mode 100644 lib/flash-rs/src/mem/desc.rs create mode 100644 lib/flash-rs/src/mem/pool.rs create mode 100644 lib/flash-rs/src/mem/ring/comp.rs create mode 100644 lib/flash-rs/src/mem/ring/error.rs create mode 100644 lib/flash-rs/src/mem/ring/fill.rs create mode 100644 lib/flash-rs/src/mem/ring/mod.rs create mode 100644 lib/flash-rs/src/mem/ring/rx.rs create mode 100644 lib/flash-rs/src/mem/ring/tx.rs rename lib/flash-rs/src/{xsk => }/stats.rs (83%) delete mode 100644 lib/flash-rs/src/xsk/desc.rs delete mode 100644 lib/flash-rs/src/xsk/ring.rs diff 
--git a/lib/flash-rs/Cargo.toml b/lib/flash-rs/Cargo.toml index 94b0c22..7dd2d78 100644 --- a/lib/flash-rs/Cargo.toml +++ b/lib/flash-rs/Cargo.toml @@ -12,6 +12,7 @@ clap = { version = "4.5.35", features = ["derive"], optional = true } libc = "0.2.171" libxdp-sys = "0.2.1" quanta = "0.12.5" +ringbuffer = { version = "0.15.0", optional = true } thiserror = "2.0.12" tracing = { version = "0.1.41", optional = true } uds = "0.4.2" @@ -19,5 +20,6 @@ uds = "0.4.2" [features] default = [] clap = ["dep:clap"] +pool = ["dep:ringbuffer"] stats = [] tracing = ["dep:tracing"] diff --git a/lib/flash-rs/src/client.rs b/lib/flash-rs/src/client.rs index 45392f5..aa61a84 100644 --- a/lib/flash-rs/src/client.rs +++ b/lib/flash-rs/src/client.rs @@ -3,13 +3,14 @@ use std::{net::Ipv4Addr, str::FromStr, sync::Arc}; use crate::{ FlashError, Socket, config::{BindFlags, FlashConfig, Mode, PollConfig, XskConfig}, + fd::Fd, mem::Umem, uds::UdsClient, - xsk::{Fd, SocketShared}, + xsk::SocketShared, }; #[cfg(feature = "stats")] -use crate::{config::XdpFlags, xsk::Stats}; +use crate::{config::XdpFlags, stats::Stats}; pub struct Route { pub ip_addr: Ipv4Addr, @@ -52,43 +53,35 @@ pub fn connect(config: &FlashConfig) -> Result<(Vec, Route), FlashError> #[cfg(feature = "tracing")] tracing::debug!("Mode: {mode:?}"); - let poll_timeout = if mode.contains(Mode::FLASH_POLL) { - uds_client.get_poll_timeout()? - } else { - 0 - }; + // let poll_timeout = if mode.contains(Mode::FLASH_POLL) { + // uds_client.get_poll_timeout()? 
+ // } else { + // 0 + // }; - #[cfg(feature = "tracing")] - tracing::debug!("Poll Timeout: {poll_timeout}"); + // #[cfg(feature = "tracing")] + // tracing::debug!("Poll Timeout: {poll_timeout}"); let mut socket_info = Vec::with_capacity(total_sockets); for _ in 0..total_sockets { - #[cfg(feature = "stats")] + #[cfg(any(feature = "stats", feature = "tracing"))] let (fd, ifqueue) = uds_client.create_socket()?; - #[cfg(not(feature = "stats"))] + #[cfg(not(any(feature = "stats", feature = "tracing")))] let (fd, _) = uds_client.create_socket()?; - let fd = Fd::new(fd, poll_timeout)?; - #[cfg(feature = "tracing")] - { - #[cfg(feature = "stats")] - tracing::debug!( - "Socket: {} :: FD: {fd:?} Ifqueue: {ifqueue}", - socket_info.len() - ); - - #[cfg(not(feature = "stats"))] - tracing::debug!("Socket: {} :: FD: {fd:?}", socket_info.len()); - } + tracing::debug!( + "Socket: {} :: FD: {fd:?} Ifqueue: {ifqueue}", + socket_info.len() + ); #[cfg(feature = "stats")] - socket_info.push((fd, ifqueue)); + socket_info.push((Fd::new(fd), ifqueue)); #[cfg(not(feature = "stats"))] - socket_info.push(fd); + socket_info.push(Fd::new(fd)); } #[cfg(feature = "stats")] @@ -97,6 +90,11 @@ pub fn connect(config: &FlashConfig) -> Result<(Vec, Route), FlashError> #[cfg(all(feature = "stats", feature = "tracing"))] tracing::debug!("Ifname: {ifname}"); + // let route_size = uds_client.get_route_info()?; + + // #[cfg(feature = "tracing")] + // tracing::debug!("Route Size: {route_size}"); + let route = Route { ip_addr: Ipv4Addr::from_str(&uds_client.get_ip_addr()?)?, next: uds_client @@ -121,12 +119,16 @@ pub fn connect(config: &FlashConfig) -> Result<(Vec, Route), FlashError> let socket_shared = Arc::new(SocketShared::new(xsk_config, poll_config, uds_client)); #[cfg(feature = "stats")] - let mut sockets = socket_info + let sockets = socket_info .into_iter() - .map(|(fd, ifqueue)| { + .enumerate() + .map(|(i, (fd, ifqueue))| { Socket::new( fd.clone(), - Umem::new(umem_fd, umem_size, umem_scale, 
umem_offset)?, + Umem::new(umem_fd, umem_size)?, + i, + umem_scale, + umem_offset, Stats::new(fd, ifname.clone(), ifqueue, xdp_flags.clone()), socket_shared.clone(), ) @@ -134,21 +136,20 @@ pub fn connect(config: &FlashConfig) -> Result<(Vec, Route), FlashError> .collect::, _>>()?; #[cfg(not(feature = "stats"))] - let mut sockets = socket_info + let sockets = socket_info .into_iter() - .map(|fd| { + .enumerate() + .map(|(i, fd)| { Socket::new( fd.clone(), - Umem::new(umem_fd, umem_size, umem_scale, umem_offset)?, + Umem::new(umem_fd, umem_size)?, + i, + umem_scale, + umem_offset, socket_shared.clone(), ) }) .collect::, _>>()?; - sockets - .iter_mut() - .enumerate() - .try_for_each(|(i, socket)| socket.populate_fq(i))?; - Ok((sockets, route)) } diff --git a/lib/flash-rs/src/config/common.rs b/lib/flash-rs/src/config/common.rs index ea0ffad..8ddd391 100644 --- a/lib/flash-rs/src/config/common.rs +++ b/lib/flash-rs/src/config/common.rs @@ -5,8 +5,8 @@ use super::FlashConfig; impl FlashConfig { #[allow(clippy::must_use_candidate)] pub fn new( - umem_id: u32, - nf_id: u32, + umem_id: u16, + nf_id: u16, smart_poll: bool, idle_timeout: Duration, idleness: f32, diff --git a/lib/flash-rs/src/config/config_clap.rs b/lib/flash-rs/src/config/config_clap.rs index b599c71..985ef14 100644 --- a/lib/flash-rs/src/config/config_clap.rs +++ b/lib/flash-rs/src/config/config_clap.rs @@ -5,10 +5,10 @@ use clap::Parser; #[derive(Debug, Parser)] pub struct FlashConfig { #[arg(short, long, help = "Umem id used to connect to monitor")] - pub(crate) umem_id: u32, + pub(crate) umem_id: u16, #[arg(short = 'f', long, help = "NF id used to connect to monitor")] - pub(crate) nf_id: u32, + pub(crate) nf_id: u16, #[arg( short = 'p', diff --git a/lib/flash-rs/src/config/config_noclap.rs b/lib/flash-rs/src/config/config_noclap.rs index 6a8f8da..1ab9cff 100644 --- a/lib/flash-rs/src/config/config_noclap.rs +++ b/lib/flash-rs/src/config/config_noclap.rs @@ -2,8 +2,8 @@ use std::time::Duration; 
#[derive(Debug)] pub struct FlashConfig { - pub(crate) umem_id: u32, - pub(crate) nf_id: u32, + pub(crate) umem_id: u16, + pub(crate) nf_id: u16, pub(crate) smart_poll: bool, pub(crate) idle_timeout: Duration, pub(crate) idleness: f32, diff --git a/lib/flash-rs/src/error.rs b/lib/flash-rs/src/error.rs index 4c915ca..2a45b51 100644 --- a/lib/flash-rs/src/error.rs +++ b/lib/flash-rs/src/error.rs @@ -1,6 +1,6 @@ use std::{io, net::AddrParseError}; -use crate::{config::ConfigError, uds::UdsError, xsk::SocketError}; +use crate::{config::ConfigError, fd::FdError, uds::UdsError, xsk::SocketError}; #[derive(Debug, thiserror::Error)] #[error("flash error: {0}")] @@ -10,5 +10,6 @@ pub enum FlashError { Config(#[from] ConfigError), UDS(#[from] UdsError), + Fd(#[from] FdError), Socket(#[from] SocketError), } diff --git a/lib/flash-rs/src/fd/error.rs b/lib/flash-rs/src/fd/error.rs new file mode 100644 index 0000000..2519b2f --- /dev/null +++ b/lib/flash-rs/src/fd/error.rs @@ -0,0 +1,19 @@ +use std::io; + +pub(super) type FdResult = Result; + +#[derive(Debug, thiserror::Error)] +#[error("fd error: {0}")] +pub enum FdError { + IO(#[from] io::Error), + + #[error("fd error: optlen does not match struct size")] + SockOptSize, +} + +impl FdError { + #[inline] + pub(crate) fn last_os_error() -> Self { + FdError::IO(io::Error::last_os_error()) + } +} diff --git a/lib/flash-rs/src/xsk/fd.rs b/lib/flash-rs/src/fd/fd.rs similarity index 51% rename from lib/flash-rs/src/xsk/fd.rs rename to lib/flash-rs/src/fd/fd.rs index a8258bc..8815ff9 100644 --- a/lib/flash-rs/src/xsk/fd.rs +++ b/lib/flash-rs/src/fd/fd.rs @@ -1,57 +1,74 @@ -use std::{io, ptr}; +use std::{fmt, io, ptr}; -use libc::{MSG_DONTWAIT, SOL_XDP, XDP_MMAP_OFFSETS, pollfd, ssize_t}; +use libc::{ + EAGAIN, EBUSY, ENETDOWN, ENOBUFS, MSG_DONTWAIT, SOL_XDP, XDP_MMAP_OFFSETS, pollfd, ssize_t, +}; #[cfg(feature = "stats")] use libc::XDP_STATISTICS; -use crate::mem::{MemError, Mmap}; +use crate::{ + mem::{MemError, Mmap}, + util, +}; use 
super::{ - error::{SocketError, SocketResult}, + error::{FdError, FdResult}, xdp::{XDP_MMAP_OFFSETS_SIZEOF, XdpMmapOffsets}, }; #[cfg(feature = "stats")] use super::xdp::{XDP_STATISTICS_SIZEOF, XdpStatistics}; -#[allow(clippy::struct_field_names)] -#[derive(Clone, Debug)] +#[derive(Clone)] pub(crate) struct Fd { id: i32, poll_fd: pollfd, - poll_timeout: i32, + // poll_timeout: i32, +} + +impl fmt::Debug for Fd { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self, f) + } } impl Fd { - pub(crate) fn new(id: i32, poll_timeout: i32) -> SocketResult { - if id < 0 { - Err(SocketError::InvalidFd) - } else { - Ok(Fd { - id, - poll_fd: pollfd { - fd: id, - events: libc::POLLIN, - revents: 0, - }, - poll_timeout, - }) + pub(crate) fn new(id: i32) -> Self { + assert!(id >= 0, "Invalid file descriptor: {id}"); + + Fd { + id, + poll_fd: pollfd { + fd: id, + events: libc::POLLIN, + revents: 0, + }, + // poll_timeout, } } #[inline] - pub(super) fn mmap(&self, len: usize, offset: i64) -> Result { + pub(crate) fn mmap(&self, len: usize, offset: i64) -> Result { Mmap::new(len, self.id, offset, true) } #[inline] - pub(super) fn kick(&self) -> ssize_t { - unsafe { libc::sendto(self.id, ptr::null(), 0, MSG_DONTWAIT, ptr::null(), 0) } + pub(crate) fn kick(&self) -> Result { + let n = unsafe { libc::sendto(self.id, ptr::null(), 0, MSG_DONTWAIT, ptr::null(), 0) }; + + if n >= 0 { + Ok(n) + } else { + match util::get_errno() { + ENOBUFS | EAGAIN | EBUSY | ENETDOWN => Ok(0), + _ => Err(()), + } + } } #[inline] - pub(super) fn wakeup(&self) { + pub(crate) fn wakeup(&self) { unsafe { libc::recvfrom( self.id, @@ -65,15 +82,15 @@ impl Fd { } #[inline] - pub(super) fn poll(&mut self) -> io::Result { - match unsafe { libc::poll(&mut self.poll_fd, 1, self.poll_timeout) } { + pub(crate) fn poll(&mut self) -> io::Result { + match unsafe { libc::poll(&raw mut self.poll_fd, 1, -1) } { -1 => Err(io::Error::last_os_error()), 0 => Ok(false), _ => Ok(true), } } - 
pub(super) fn xdp_mmap_offsets(&self) -> SocketResult { + pub(crate) fn xdp_mmap_offsets(&self) -> FdResult { let mut off = XdpMmapOffsets::default(); let mut optlen = XDP_MMAP_OFFSETS_SIZEOF; @@ -83,20 +100,20 @@ impl Fd { SOL_XDP, XDP_MMAP_OFFSETS, (&raw mut off).cast(), - &mut optlen, + &raw mut optlen, ) } != 0 { - Err(SocketError::last_os_error()) + Err(FdError::last_os_error()) } else if optlen == XDP_MMAP_OFFSETS_SIZEOF { Ok(off) } else { - Err(SocketError::SockOptSize) + Err(FdError::SockOptSize) } } #[cfg(feature = "stats")] - pub(super) fn xdp_statistics(&self) -> SocketResult { + pub(crate) fn xdp_statistics(&self) -> FdResult { let mut stats = XdpStatistics::default(); let mut optlen = XDP_STATISTICS_SIZEOF; @@ -110,11 +127,11 @@ impl Fd { ) } != 0 { - Err(SocketError::last_os_error()) + Err(FdError::last_os_error()) } else if optlen == XDP_STATISTICS_SIZEOF { Ok(stats) } else { - Err(SocketError::SockOptSize) + Err(FdError::SockOptSize) } } } diff --git a/lib/flash-rs/src/fd/mod.rs b/lib/flash-rs/src/fd/mod.rs new file mode 100644 index 0000000..f2b28c1 --- /dev/null +++ b/lib/flash-rs/src/fd/mod.rs @@ -0,0 +1,7 @@ +mod error; +mod fd; +mod xdp; + +pub(crate) use fd::Fd; + +pub use error::FdError; diff --git a/lib/flash-rs/src/xsk/xdp.rs b/lib/flash-rs/src/fd/xdp.rs similarity index 81% rename from lib/flash-rs/src/xsk/xdp.rs rename to lib/flash-rs/src/fd/xdp.rs index 718c27e..dbda5a4 100644 --- a/lib/flash-rs/src/xsk/xdp.rs +++ b/lib/flash-rs/src/fd/xdp.rs @@ -13,7 +13,7 @@ pub(super) const XDP_MMAP_OFFSETS_SIZEOF: u32 = mem::size_of:: pub(super) const XDP_STATISTICS_SIZEOF: u32 = mem::size_of::() as _; #[repr(transparent)] -pub(super) struct XdpMmapOffsets(xdp_mmap_offsets); +pub(crate) struct XdpMmapOffsets(xdp_mmap_offsets); impl Default for XdpMmapOffsets { fn default() -> Self { @@ -37,30 +37,29 @@ fn new_xdp_ring_offset() -> xdp_ring_offset { impl XdpMmapOffsets { #[inline] - pub(super) fn rx(&self) -> &xdp_ring_offset { + pub(crate) fn 
rx(&self) -> &xdp_ring_offset { &self.0.rx } #[inline] - pub(super) fn tx(&self) -> &xdp_ring_offset { + pub(crate) fn tx(&self) -> &xdp_ring_offset { &self.0.tx } #[inline] - pub(super) fn fr(&self) -> &xdp_ring_offset { + pub(crate) fn fr(&self) -> &xdp_ring_offset { &self.0.fr } #[inline] - pub(super) fn cr(&self) -> &xdp_ring_offset { + pub(crate) fn cr(&self) -> &xdp_ring_offset { &self.0.cr } } #[cfg(feature = "stats")] -#[derive(Debug)] #[repr(transparent)] -pub struct XdpStatistics(xdp_statistics); +pub(crate) struct XdpStatistics(xdp_statistics); #[cfg(feature = "stats")] impl Default for XdpStatistics { diff --git a/lib/flash-rs/src/lib.rs b/lib/flash-rs/src/lib.rs index ca6b5f5..c4e3334 100644 --- a/lib/flash-rs/src/lib.rs +++ b/lib/flash-rs/src/lib.rs @@ -1,11 +1,15 @@ mod client; mod config; mod error; +mod fd; mod mem; mod uds; mod util; mod xsk; +#[cfg(feature = "stats")] +mod stats; + pub use crate::{ client::{Route, connect}, config::FlashConfig, @@ -14,4 +18,4 @@ pub use crate::{ }; #[cfg(feature = "stats")] -pub use crate::xsk::Stats; +pub use stats::Stats; diff --git a/lib/flash-rs/src/mem/desc.rs b/lib/flash-rs/src/mem/desc.rs new file mode 100644 index 0000000..93ec144 --- /dev/null +++ b/lib/flash-rs/src/mem/desc.rs @@ -0,0 +1,57 @@ +use libxdp_sys::{xdp_desc, xsk_umem__add_offset_to_addr, xsk_umem__extract_addr}; + +use crate::mem::FRAME_SIZE; + +#[derive(Debug)] +pub struct Desc { + pub(super) addr: u64, + pub(super) len: u32, + options: u32, +} + +impl From for Desc { + #[inline] + fn from(addr: u64) -> Self { + Self { + addr, + len: FRAME_SIZE, + options: 0, + } + } +} + +impl From<&xdp_desc> for Desc { + #[inline] + fn from(desc: &xdp_desc) -> Self { + Self { + addr: unsafe { xsk_umem__add_offset_to_addr(desc.addr) }, + len: desc.len, + options: desc.options, + } + } +} + +impl Desc { + #[inline] + pub fn len(&self) -> usize { + self.len as usize + } + + #[allow(clippy::cast_possible_truncation)] + #[inline] + pub fn set_next(&mut self, 
idx: usize) { + self.options = (self.options & 0xFFFF) | ((idx as u32) << 16); + } + + #[inline] + pub(crate) fn extract_addr(self) -> u64 { + unsafe { xsk_umem__extract_addr(self.addr) } + } + + #[inline] + pub(crate) fn copy_to(&self, desc: &mut xdp_desc) { + desc.addr = self.addr; + desc.len = self.len; + desc.options = self.options & 0xFFFF_0000; + } +} diff --git a/lib/flash-rs/src/mem/error.rs b/lib/flash-rs/src/mem/error.rs index 1b605bf..db211ba 100644 --- a/lib/flash-rs/src/mem/error.rs +++ b/lib/flash-rs/src/mem/error.rs @@ -3,8 +3,8 @@ use std::io; pub(super) type MemResult = Result; #[derive(Debug, thiserror::Error)] +#[error("mem error: {0}")] pub enum MemError { - #[error("mem error: {0}")] IO(#[from] io::Error), #[error("mem error: mmap not page aligned")] @@ -12,6 +12,9 @@ pub enum MemError { #[error("mem error: mmap offset out of bounds")] MmapOffset, + + #[error("mem error: could not populate fq")] + FqPopulate, } impl MemError { diff --git a/lib/flash-rs/src/mem/mod.rs b/lib/flash-rs/src/mem/mod.rs index 37b064c..f6e16c4 100644 --- a/lib/flash-rs/src/mem/mod.rs +++ b/lib/flash-rs/src/mem/mod.rs @@ -1,8 +1,20 @@ +mod desc; mod error; mod mmap; +mod ring; mod umem; +#[cfg(feature = "pool")] +mod pool; + +pub(crate) use desc::Desc; pub(crate) use mmap::Mmap; +pub(crate) use ring::{CompRing, Cons, FillRing, Prod, RxRing, TxRing}; pub(crate) use umem::Umem; +#[cfg(feature = "pool")] +pub(crate) use pool::Pool; + pub use error::MemError; + +const FRAME_SIZE: u32 = libxdp_sys::XSK_UMEM__DEFAULT_FRAME_SIZE; diff --git a/lib/flash-rs/src/mem/pool.rs b/lib/flash-rs/src/mem/pool.rs new file mode 100644 index 0000000..991735a --- /dev/null +++ b/lib/flash-rs/src/mem/pool.rs @@ -0,0 +1,39 @@ +use libxdp_sys::XSK_RING_PROD__DEFAULT_NUM_DESCS; +use ringbuffer::{AllocRingBuffer, RingBuffer}; + +use super::FRAME_SIZE; + +#[derive(Debug)] +pub(crate) struct Pool(AllocRingBuffer); + +impl Pool { + pub(crate) fn new(scale: u32, offset: u64) -> Self { + let 
frame_size = u64::from(FRAME_SIZE); + let nr_frames = XSK_RING_PROD__DEFAULT_NUM_DESCS * scale; + + let mut ring_buffer = AllocRingBuffer::new(2 * nr_frames as usize); + + let mut addr = (offset + u64::from(nr_frames)) * frame_size; + for _ in 0..nr_frames { + ring_buffer.push(addr); + addr += frame_size; + } + + Self(ring_buffer) + } + + #[inline] + pub(crate) fn get(&mut self) -> Option { + self.0.dequeue() + } + + #[inline] + pub(crate) fn put(&mut self, addr: u64) { + self.0.push(addr); + } + + #[inline] + pub(crate) fn extend(&mut self, iter: impl IntoIterator) { + self.0.extend(iter); + } +} diff --git a/lib/flash-rs/src/mem/ring/comp.rs b/lib/flash-rs/src/mem/ring/comp.rs new file mode 100644 index 0000000..bd18152 --- /dev/null +++ b/lib/flash-rs/src/mem/ring/comp.rs @@ -0,0 +1,64 @@ +use std::mem; + +use libc::{XDP_UMEM_PGOFF_COMPLETION_RING, size_t, xdp_ring_offset}; +use libxdp_sys::{ + XSK_RING_CONS__DEFAULT_NUM_DESCS, xsk_ring_cons, xsk_ring_cons__comp_addr, xsk_ring_cons__peek, + xsk_ring_cons__release, +}; + +use crate::{fd::Fd, mem::Mmap}; + +use super::{Cons, error::RingResult}; + +#[derive(Debug)] +pub(crate) struct CompRing { + ring: xsk_ring_cons, + _mmap: Mmap, +} + +unsafe impl Send for CompRing {} + +impl CompRing { + #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] + pub(crate) fn new(fd: &Fd, off: &xdp_ring_offset, scale: u32) -> RingResult { + let comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS * scale; + + let mmap = fd.mmap( + off.desc as size_t + comp_size as size_t * mem::size_of::(), + XDP_UMEM_PGOFF_COMPLETION_RING as _, + )?; + + let (prod, cons, ring, flags) = mmap.add_offset(off)?; + + Ok(Self { + ring: xsk_ring_cons { + cached_prod: 0, + cached_cons: 0, + mask: comp_size - 1, + size: comp_size, + producer: prod, + consumer: cons, + ring, + flags, + }, + _mmap: mmap, + }) + } + + #[inline] + pub(crate) fn addr(&self, idx: u32) -> Option<&u64> { + unsafe { xsk_ring_cons__comp_addr(&raw const self.ring, 
idx).as_ref() } + } +} + +impl Cons for CompRing { + #[inline] + fn peek(&mut self, nb: u32, idx: &mut u32) -> u32 { + unsafe { xsk_ring_cons__peek(&raw mut self.ring, nb, idx) } + } + + #[inline] + fn release(&mut self, nb: u32) { + unsafe { xsk_ring_cons__release(&raw mut self.ring, nb) } + } +} diff --git a/lib/flash-rs/src/mem/ring/error.rs b/lib/flash-rs/src/mem/ring/error.rs new file mode 100644 index 0000000..02cea43 --- /dev/null +++ b/lib/flash-rs/src/mem/ring/error.rs @@ -0,0 +1,4 @@ +use crate::mem::MemError; + +pub(super) type RingResult = Result; +pub(super) type RingError = MemError; diff --git a/lib/flash-rs/src/mem/ring/fill.rs b/lib/flash-rs/src/mem/ring/fill.rs new file mode 100644 index 0000000..d9c8ec0 --- /dev/null +++ b/lib/flash-rs/src/mem/ring/fill.rs @@ -0,0 +1,95 @@ +use std::mem; + +use libc::{XDP_UMEM_PGOFF_FILL_RING, size_t, xdp_ring_offset}; +use libxdp_sys::{ + XSK_RING_PROD__DEFAULT_NUM_DESCS, xsk_ring_prod, xsk_ring_prod__fill_addr, + xsk_ring_prod__needs_wakeup, xsk_ring_prod__reserve, xsk_ring_prod__submit, +}; + +use crate::{ + fd::Fd, + mem::{FRAME_SIZE, Mmap}, +}; + +use super::{Prod, error::RingError, error::RingResult}; + +#[derive(Debug)] +pub(crate) struct FillRing { + ring: xsk_ring_prod, + _mmap: Mmap, +} + +unsafe impl Send for FillRing {} + +impl FillRing { + #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] + pub(crate) fn new(fd: &Fd, off: &xdp_ring_offset, scale: u32) -> RingResult { + let fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2 * scale; + + let mmap = fd.mmap( + off.desc as size_t + fill_size as size_t * mem::size_of::(), + XDP_UMEM_PGOFF_FILL_RING as _, + )?; + + let (prod, cons, ring, flags) = mmap.add_offset(off)?; + + Ok(Self { + ring: xsk_ring_prod { + cached_prod: 0, + cached_cons: fill_size, + mask: fill_size - 1, + size: fill_size, + producer: prod, + consumer: cons, + ring, + flags, + }, + _mmap: mmap, + }) + } + + #[inline] + pub(crate) fn addr(&mut self, idx: u32) -> 
Option<&mut u64> { + unsafe { xsk_ring_prod__fill_addr(&raw mut self.ring, idx).as_mut() } + } + + pub(crate) fn populate(&mut self, scale: u32, offset: u64) -> RingResult<()> { + let frame_size = u64::from(FRAME_SIZE); + let nr_frames = XSK_RING_PROD__DEFAULT_NUM_DESCS * scale; + + let mut idx_fq = 0; + if self.reserve(nr_frames, &mut idx_fq) != nr_frames { + return Err(RingError::FqPopulate); + } + + let mut addr = offset * frame_size; + for _ in 0..nr_frames { + if let Some(fill_addr) = self.addr(idx_fq) { + *fill_addr = addr; + } + + idx_fq += 1; + addr += frame_size; + } + + self.submit(nr_frames); + Ok(()) + } +} + +impl Prod for FillRing { + #[inline] + fn needs_wakeup(&self) -> bool { + unsafe { xsk_ring_prod__needs_wakeup(&raw const self.ring) != 0 } + } + + #[inline] + fn reserve(&mut self, nb: u32, idx: &mut u32) -> u32 { + unsafe { xsk_ring_prod__reserve(&raw mut self.ring, nb, idx) } + } + + #[inline] + fn submit(&mut self, nb: u32) { + unsafe { xsk_ring_prod__submit(&raw mut self.ring, nb) } + } +} diff --git a/lib/flash-rs/src/mem/ring/mod.rs b/lib/flash-rs/src/mem/ring/mod.rs new file mode 100644 index 0000000..bf4b51d --- /dev/null +++ b/lib/flash-rs/src/mem/ring/mod.rs @@ -0,0 +1,21 @@ +mod comp; +mod error; +mod fill; +mod rx; +mod tx; + +pub(crate) use comp::CompRing; +pub(crate) use fill::FillRing; +pub(crate) use rx::RxRing; +pub(crate) use tx::TxRing; + +pub(crate) trait Prod { + fn needs_wakeup(&self) -> bool; + fn reserve(&mut self, nb: u32, idx: &mut u32) -> u32; + fn submit(&mut self, nb: u32); +} + +pub(crate) trait Cons { + fn peek(&mut self, nb: u32, idx: &mut u32) -> u32; + fn release(&mut self, nb: u32); +} diff --git a/lib/flash-rs/src/mem/ring/rx.rs b/lib/flash-rs/src/mem/ring/rx.rs new file mode 100644 index 0000000..6096950 --- /dev/null +++ b/lib/flash-rs/src/mem/ring/rx.rs @@ -0,0 +1,64 @@ +use std::mem; + +use libc::{XDP_PGOFF_RX_RING, size_t, xdp_ring_offset}; +use libxdp_sys::{ + XSK_RING_CONS__DEFAULT_NUM_DESCS, xdp_desc, 
xsk_ring_cons, xsk_ring_cons__peek, + xsk_ring_cons__release, xsk_ring_cons__rx_desc, +}; + +use crate::{fd::Fd, mem::Mmap}; + +use super::{Cons, error::RingResult}; + +#[derive(Debug)] +pub(crate) struct RxRing { + ring: xsk_ring_cons, + _mmap: Mmap, +} + +unsafe impl Send for RxRing {} + +impl RxRing { + #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] + pub(crate) fn new(fd: &Fd, off: &xdp_ring_offset, scale: u32) -> RingResult { + let rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS * scale; + + let mmap = fd.mmap( + off.desc as size_t + rx_size as size_t * mem::size_of::(), + XDP_PGOFF_RX_RING, + )?; + + let (prod, cons, ring, flags) = mmap.add_offset(off)?; + + Ok(Self { + ring: xsk_ring_cons { + cached_prod: unsafe { *prod }, + cached_cons: unsafe { *cons }, + mask: rx_size - 1, + size: rx_size, + producer: prod, + consumer: cons, + ring, + flags, + }, + _mmap: mmap, + }) + } + + #[inline] + pub(crate) fn desc(&self, idx: u32) -> Option<&xdp_desc> { + unsafe { xsk_ring_cons__rx_desc(&raw const self.ring, idx).as_ref() } + } +} + +impl Cons for RxRing { + #[inline] + fn peek(&mut self, nb: u32, idx: &mut u32) -> u32 { + unsafe { xsk_ring_cons__peek(&raw mut self.ring, nb, idx) } + } + + #[inline] + fn release(&mut self, nb: u32) { + unsafe { xsk_ring_cons__release(&raw mut self.ring, nb) } + } +} diff --git a/lib/flash-rs/src/mem/ring/tx.rs b/lib/flash-rs/src/mem/ring/tx.rs new file mode 100644 index 0000000..bc2c37c --- /dev/null +++ b/lib/flash-rs/src/mem/ring/tx.rs @@ -0,0 +1,69 @@ +use std::mem; + +use libc::{XDP_PGOFF_TX_RING, size_t, xdp_ring_offset}; +use libxdp_sys::{ + XSK_RING_PROD__DEFAULT_NUM_DESCS, xdp_desc, xsk_ring_prod, xsk_ring_prod__needs_wakeup, + xsk_ring_prod__reserve, xsk_ring_prod__submit, xsk_ring_prod__tx_desc, +}; + +use crate::{fd::Fd, mem::Mmap}; + +use super::{Prod, error::RingResult}; + +#[derive(Debug)] +pub(crate) struct TxRing { + ring: xsk_ring_prod, + _mmap: Mmap, +} + +unsafe impl Send for TxRing {} + +impl 
TxRing { + #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] + pub(crate) fn new(fd: &Fd, off: &xdp_ring_offset, scale: u32) -> RingResult { + let tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * scale; + + let mmap = fd.mmap( + off.desc as size_t + tx_size as size_t * mem::size_of::(), + XDP_PGOFF_TX_RING, + )?; + + let (prod, cons, ring, flags) = mmap.add_offset(off)?; + + Ok(Self { + ring: xsk_ring_prod { + cached_prod: unsafe { *prod }, + cached_cons: unsafe { *cons } + tx_size, + mask: tx_size - 1, + size: tx_size, + producer: prod, + consumer: cons, + ring, + flags, + }, + _mmap: mmap, + }) + } + + #[inline] + pub(crate) fn desc(&mut self, idx: u32) -> Option<&mut xdp_desc> { + unsafe { xsk_ring_prod__tx_desc(&raw mut self.ring, idx).as_mut() } + } +} + +impl Prod for TxRing { + #[inline] + fn needs_wakeup(&self) -> bool { + unsafe { xsk_ring_prod__needs_wakeup(&raw const self.ring) != 0 } + } + + #[inline] + fn reserve(&mut self, nb: u32, idx: &mut u32) -> u32 { + unsafe { xsk_ring_prod__reserve(&raw mut self.ring, nb, idx) } + } + + #[inline] + fn submit(&mut self, nb: u32) { + unsafe { xsk_ring_prod__submit(&raw mut self.ring, nb) } + } +} diff --git a/lib/flash-rs/src/mem/umem.rs b/lib/flash-rs/src/mem/umem.rs index 0b249e7..4b008c7 100644 --- a/lib/flash-rs/src/mem/umem.rs +++ b/lib/flash-rs/src/mem/umem.rs @@ -1,32 +1,25 @@ use super::{ + desc::Desc, error::{MemError, MemResult}, mmap::Mmap, }; #[derive(Debug)] -pub(crate) struct Umem { - mmap: Mmap, - pub(crate) scale: u32, - pub(crate) offset: u64, -} +pub(crate) struct Umem(Mmap); impl Umem { - pub(crate) fn new(fd: i32, size: usize, scale: u32, offset: u64) -> MemResult { + pub(crate) fn new(fd: i32, size: usize) -> MemResult { let mmap = Mmap::new(size, fd, 0, false)?; if mmap.is_page_aligned() { - Ok(Self { - mmap, - scale, - offset, - }) + Ok(Self(mmap)) } else { Err(MemError::MmapAlign) } } #[inline] - pub(crate) fn get_data(&mut self, offset: u64, len: usize) -> MemResult<&mut 
[u8]> { - self.mmap.get_data(offset, len) + pub(crate) fn get_data(&mut self, desc: &Desc) -> MemResult<&mut [u8]> { + self.0.get_data(desc.addr, desc.len as usize) } } diff --git a/lib/flash-rs/src/xsk/stats.rs b/lib/flash-rs/src/stats.rs similarity index 83% rename from lib/flash-rs/src/xsk/stats.rs rename to lib/flash-rs/src/stats.rs index 018e08a..cad7b57 100644 --- a/lib/flash-rs/src/xsk/stats.rs +++ b/lib/flash-rs/src/stats.rs @@ -1,8 +1,9 @@ use std::{cell::UnsafeCell, mem}; -use crate::config::XdpFlags; - -use super::{Fd, error::SocketResult, xdp::XdpStatistics}; +use crate::{ + config::XdpFlags, + fd::{Fd, FdError}, +}; #[derive(Debug)] pub struct Stats { @@ -38,10 +39,10 @@ impl Stats { unsafe { (*self.app.get()).clone() } } - #[allow(clippy::missing_errors_doc)] - pub fn get_xdp_stats(&self) -> SocketResult { + #[allow(clippy::missing_errors_doc, clippy::missing_transmute_annotations)] + pub fn get_xdp_stats(&self) -> Result { let xdp_stats = self.fd.xdp_statistics()?; - Ok(unsafe { mem::transmute::(xdp_stats) }) + Ok(unsafe { mem::transmute::<_, XdpStats>(xdp_stats) }) } } @@ -64,6 +65,8 @@ pub struct AppStats { pub fill_fail_polls: u64, pub tx_copy_sendtos: u64, pub tx_wakeup_sendtos: u64, + pub opt_polls: u64, + pub backpressure: u64, } #[derive(Debug, Default, Clone)] diff --git a/lib/flash-rs/src/uds/client.rs b/lib/flash-rs/src/uds/client.rs index 1b7b2da..3e755fb 100644 --- a/lib/flash-rs/src/uds/client.rs +++ b/lib/flash-rs/src/uds/client.rs @@ -29,18 +29,20 @@ impl UdsClient { #[allow(clippy::similar_names)] pub(crate) fn get_umem( &mut self, - umem_id: u32, - nf_id: u32, + umem_id: u16, + nf_id: u16, ) -> UdsResult<(i32, usize, usize, u32)> { #[repr(C)] struct NfData { - umem_id: u32, - nf_id: u32, + umem_id: i32, + nf_id: i32, } self.conn.write_all(&FLASH_GET_UMEM)?; - self.conn - .write_all(util::as_bytes(&NfData { umem_id, nf_id }))?; + self.conn.write_all(util::as_bytes(&NfData { + umem_id: i32::from(umem_id), + nf_id: i32::from(nf_id), + 
}))?; let umem_fd = self.conn.recv_fd()?; if umem_fd < 0 { @@ -76,7 +78,9 @@ impl UdsClient { let fd = self.conn.recv_fd()?; let ifqueue = self.conn.recv_i32()?; - if ifqueue < 0 { + if fd < 0 { + Err(UdsError::InvalidSocketFd) + } else if ifqueue < 0 { Err(UdsError::InvalidSocketIfqueue) } else { Ok((fd, ifqueue as u32)) @@ -94,50 +98,30 @@ impl UdsClient { } } - pub(crate) fn get_route_info(&mut self) -> UdsResult> { + pub(crate) fn get_route_info(&mut self) -> UdsResult { self.conn.write_all(&FLASH_GET_ROUTE_INFO)?; let route_size = self.conn.recv_i32()?; if route_size < 0 { - Err(UdsError::InvalidRouteSize) + Err(UdsError::InvalidNextSize) } else { - Ok((0..route_size) - .map(|_| self.conn.recv_i32()) - .collect::, _>>()?) + Ok(route_size as usize) } } pub(crate) fn get_bind_flags(&mut self) -> UdsResult { self.conn.write_all(&FLASH_GET_BIND_FLAGS)?; - let bind_flags = self.conn.recv_i32()?; - - if bind_flags < 0 { - Err(UdsError::InvalidBindFlags) - } else { - Ok(bind_flags as u32) - } + Ok(self.conn.recv_u32()?) } pub(crate) fn get_xdp_flags(&mut self) -> UdsResult { self.conn.write_all(&FLASH_GET_XDP_FLAGS)?; - let xdp_flags = self.conn.recv_i32()?; - - if xdp_flags < 0 { - Err(UdsError::InvalidXdpFlags) - } else { - Ok(xdp_flags as u32) - } + Ok(self.conn.recv_u32()?) } pub(crate) fn get_mode(&mut self) -> UdsResult { self.conn.write_all(&FLASH_GET_MODE)?; - let mode = self.conn.recv_i32()?; - - if mode < 0 { - Err(UdsError::InvalidMode) - } else { - Ok(mode as u32) - } + Ok(self.conn.recv_u32()?) } pub(crate) fn get_poll_timeout(&mut self) -> UdsResult { @@ -165,9 +149,9 @@ impl UdsClient { let dst_size = self.conn.recv_i32()?; if dst_size < 0 { - Err(UdsError::InvalidDstIpSize) + Err(UdsError::InvalidNextSize) } else { - Ok((0..dst_size as usize) + Ok((0..dst_size) .map(|_| self.conn.recv_string::<16>()) .collect::, _>>()?) 
} diff --git a/lib/flash-rs/src/uds/conn.rs b/lib/flash-rs/src/uds/conn.rs index ed05349..9a580a3 100644 --- a/lib/flash-rs/src/uds/conn.rs +++ b/lib/flash-rs/src/uds/conn.rs @@ -35,6 +35,14 @@ impl UdsConn { Ok(i32::from_ne_bytes(buf)) } + #[inline] + pub(super) fn recv_u32(&mut self) -> io::Result { + let mut buf = [0; 4]; + self.0.read_exact(&mut buf)?; + + Ok(u32::from_ne_bytes(buf)) + } + #[inline] pub(super) fn recv_bool(&mut self) -> io::Result { let mut buf = [0; 1]; diff --git a/lib/flash-rs/src/uds/error.rs b/lib/flash-rs/src/uds/error.rs index 1183e6f..5c5bd7a 100644 --- a/lib/flash-rs/src/uds/error.rs +++ b/lib/flash-rs/src/uds/error.rs @@ -3,21 +3,21 @@ use std::io; pub(super) type UdsResult = Result; #[derive(Debug, thiserror::Error)] +#[error("uds error: {0}")] pub enum UdsError { - #[error("uds error: {0}")] IO(#[from] io::Error), #[error("uds error: invalid bind flags")] InvalidBindFlags, - #[error("uds error: invalid dest ip addr size")] - InvalidDstIpSize, - #[error("uds error: invalid mode")] InvalidMode, - #[error("uds error: invalid route size")] - InvalidRouteSize, + #[error("uds error: invalid next size")] + InvalidNextSize, + + #[error("uds error: invalid socket fd")] + InvalidSocketFd, #[error("uds error: invalid socket ifqueue")] InvalidSocketIfqueue, diff --git a/lib/flash-rs/src/xsk/desc.rs b/lib/flash-rs/src/xsk/desc.rs deleted file mode 100644 index d55b2fe..0000000 --- a/lib/flash-rs/src/xsk/desc.rs +++ /dev/null @@ -1,19 +0,0 @@ -#[derive(Debug)] -pub struct Desc { - pub(super) addr: u64, - pub(super) len: u32, - pub(super) options: u32, -} - -impl Desc { - #[inline] - pub fn len(&self) -> usize { - self.len as usize - } - - #[allow(clippy::cast_possible_truncation)] - #[inline] - pub fn set_next(&mut self, idx: usize) { - self.options = (self.options & 0xFFFF) | ((idx as u32) << 16); - } -} diff --git a/lib/flash-rs/src/xsk/error.rs b/lib/flash-rs/src/xsk/error.rs index b7ba6d4..0ed082e 100644 --- a/lib/flash-rs/src/xsk/error.rs 
+++ b/lib/flash-rs/src/xsk/error.rs @@ -1,36 +1,17 @@ use std::io; -use crate::mem::MemError; +use crate::{fd::FdError, mem::MemError, uds::UdsError}; pub(super) type SocketResult = Result; #[derive(Debug, thiserror::Error)] +#[error("xsk error: {0}")] pub enum SocketError { - #[error("xsk error: {0}")] IO(#[from] io::Error), - - #[error("xsk error: {0}")] Mem(#[from] MemError), - - #[error("xsk error: invalid file descriptor")] - InvalidFd, - - #[error("xsk error: optlen does not match struct size")] - SockOptSize, - - #[error("xsk error: could not populate fq")] - FqPopulate, - - #[error("xsk error: received more than batch size")] - BatchOverflow, + Fd(#[from] FdError), + Uds(#[from] UdsError), #[error("xsk error: size exceeds buffer length")] SizeOverflow, } - -impl SocketError { - #[inline] - pub(crate) fn last_os_error() -> Self { - SocketError::IO(io::Error::last_os_error()) - } -} diff --git a/lib/flash-rs/src/xsk/mod.rs b/lib/flash-rs/src/xsk/mod.rs index ad079bf..04f9e9b 100644 --- a/lib/flash-rs/src/xsk/mod.rs +++ b/lib/flash-rs/src/xsk/mod.rs @@ -1,19 +1,8 @@ -mod desc; mod error; -mod fd; -mod ring; mod shared; mod socket; -mod xdp; -#[cfg(feature = "stats")] -mod stats; - -pub(crate) use fd::Fd; pub(crate) use shared::SocketShared; pub use error::SocketError; pub use socket::Socket; - -#[cfg(feature = "stats")] -pub use stats::Stats; diff --git a/lib/flash-rs/src/xsk/ring.rs b/lib/flash-rs/src/xsk/ring.rs deleted file mode 100644 index 701cc3c..0000000 --- a/lib/flash-rs/src/xsk/ring.rs +++ /dev/null @@ -1,249 +0,0 @@ -use std::mem; - -use libc::{ - XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_COMPLETION_RING, XDP_UMEM_PGOFF_FILL_RING, - size_t, xdp_ring_offset, -}; -use libxdp_sys::{ - XSK_RING_CONS__DEFAULT_NUM_DESCS, XSK_RING_PROD__DEFAULT_NUM_DESCS, xdp_desc, xsk_ring_cons, - xsk_ring_cons__comp_addr, xsk_ring_cons__peek, xsk_ring_cons__release, xsk_ring_cons__rx_desc, - xsk_ring_prod, xsk_ring_prod__fill_addr, xsk_ring_prod__needs_wakeup, 
xsk_ring_prod__reserve, - xsk_ring_prod__submit, xsk_ring_prod__tx_desc, -}; - -use crate::mem::Mmap; - -use super::{error::SocketResult, fd::Fd}; - -pub(super) trait Prod { - fn needs_wakeup(&self) -> bool; - fn reserve(&mut self, nb: u32, idx: &mut u32) -> u32; - fn submit(&mut self, nb: u32); -} - -pub(super) trait Cons { - fn peek(&mut self, nb: u32, idx: &mut u32) -> u32; - fn release(&mut self, nb: u32); -} - -#[derive(Debug)] -pub(super) struct FillRing { - ring: xsk_ring_prod, - _mmap: Mmap, -} - -unsafe impl Send for FillRing {} - -impl FillRing { - #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] - pub(super) fn new(fd: &Fd, off: &xdp_ring_offset, umem_scale: u32) -> SocketResult { - let fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2 * umem_scale; - - let mmap = fd.mmap( - off.desc as size_t + fill_size as size_t * mem::size_of::(), - XDP_UMEM_PGOFF_FILL_RING as _, - )?; - - let (prod, cons, ring, flags) = mmap.add_offset(off)?; - - Ok(Self { - ring: xsk_ring_prod { - cached_prod: 0, - cached_cons: fill_size, - mask: fill_size - 1, - size: fill_size, - producer: prod, - consumer: cons, - ring, - flags, - }, - _mmap: mmap, - }) - } - - #[inline] - pub(super) fn addr(&mut self, idx: u32) -> Option<&mut u64> { - unsafe { xsk_ring_prod__fill_addr(&mut self.ring, idx).as_mut() } - } -} - -impl Prod for FillRing { - #[inline] - fn needs_wakeup(&self) -> bool { - unsafe { xsk_ring_prod__needs_wakeup(&self.ring) != 0 } - } - - #[inline] - fn reserve(&mut self, nb: u32, idx: &mut u32) -> u32 { - unsafe { xsk_ring_prod__reserve(&mut self.ring, nb, idx) } - } - - #[inline] - fn submit(&mut self, nb: u32) { - unsafe { xsk_ring_prod__submit(&mut self.ring, nb) } - } -} - -#[derive(Debug)] -pub(super) struct TxRing { - ring: xsk_ring_prod, - _mmap: Mmap, -} - -unsafe impl Send for TxRing {} - -impl TxRing { - #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] - pub(super) fn new(fd: &Fd, off: &xdp_ring_offset, umem_scale: 
u32) -> SocketResult { - let tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * umem_scale; - - let mmap = fd.mmap( - off.desc as size_t + tx_size as size_t * mem::size_of::(), - XDP_PGOFF_TX_RING, - )?; - - let (prod, cons, ring, flags) = mmap.add_offset(off)?; - - Ok(Self { - ring: xsk_ring_prod { - cached_prod: unsafe { *prod }, - cached_cons: unsafe { *cons } + tx_size, - mask: tx_size - 1, - size: tx_size, - producer: prod, - consumer: cons, - ring, - flags, - }, - _mmap: mmap, - }) - } - - #[inline] - pub(super) fn desc(&mut self, idx: u32) -> Option<&mut xdp_desc> { - unsafe { xsk_ring_prod__tx_desc(&mut self.ring, idx).as_mut() } - } -} - -impl Prod for TxRing { - #[inline] - fn needs_wakeup(&self) -> bool { - unsafe { xsk_ring_prod__needs_wakeup(&self.ring) != 0 } - } - - #[inline] - fn reserve(&mut self, nb: u32, idx: &mut u32) -> u32 { - unsafe { xsk_ring_prod__reserve(&mut self.ring, nb, idx) } - } - - #[inline] - fn submit(&mut self, nb: u32) { - unsafe { xsk_ring_prod__submit(&mut self.ring, nb) } - } -} - -#[derive(Debug)] -pub(super) struct CompRing { - ring: xsk_ring_cons, - _mmap: Mmap, -} - -unsafe impl Send for CompRing {} - -impl CompRing { - #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] - pub(super) fn new(fd: &Fd, off: &xdp_ring_offset, umem_scale: u32) -> SocketResult { - let comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS * umem_scale; - - let mmap = fd.mmap( - off.desc as size_t + comp_size as size_t * mem::size_of::(), - XDP_UMEM_PGOFF_COMPLETION_RING as _, - )?; - - let (prod, cons, ring, flags) = mmap.add_offset(off)?; - - Ok(Self { - ring: xsk_ring_cons { - cached_prod: 0, - cached_cons: 0, - mask: comp_size - 1, - size: comp_size, - producer: prod, - consumer: cons, - ring, - flags, - }, - _mmap: mmap, - }) - } - - #[inline] - pub(super) fn addr(&self, idx: u32) -> Option<&u64> { - unsafe { xsk_ring_cons__comp_addr(&self.ring, idx).as_ref() } - } -} - -impl Cons for CompRing { - #[inline] - fn peek(&mut self, nb: u32, 
idx: &mut u32) -> u32 { - unsafe { xsk_ring_cons__peek(&mut self.ring, nb, idx) } - } - - #[inline] - fn release(&mut self, nb: u32) { - unsafe { xsk_ring_cons__release(&mut self.ring, nb) } - } -} - -#[derive(Debug)] -pub(super) struct RxRing { - ring: xsk_ring_cons, - _mmap: Mmap, -} - -unsafe impl Send for RxRing {} - -impl RxRing { - #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] - pub(super) fn new(fd: &Fd, off: &xdp_ring_offset, umem_scale: u32) -> SocketResult { - let rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS * umem_scale; - - let mmap = fd.mmap( - off.desc as size_t + rx_size as size_t * mem::size_of::(), - XDP_PGOFF_RX_RING, - )?; - - let (prod, cons, ring, flags) = mmap.add_offset(off)?; - - Ok(Self { - ring: xsk_ring_cons { - cached_prod: unsafe { *prod }, - cached_cons: unsafe { *cons }, - mask: rx_size - 1, - size: rx_size, - producer: prod, - consumer: cons, - ring, - flags, - }, - _mmap: mmap, - }) - } - - #[inline] - pub(super) fn desc(&self, idx: u32) -> Option<&xdp_desc> { - unsafe { xsk_ring_cons__rx_desc(&self.ring, idx).as_ref() } - } -} - -impl Cons for RxRing { - #[inline] - fn peek(&mut self, nb: u32, idx: &mut u32) -> u32 { - unsafe { xsk_ring_cons__peek(&mut self.ring, nb, idx) } - } - - #[inline] - fn release(&mut self, nb: u32) { - unsafe { xsk_ring_cons__release(&mut self.ring, nb) } - } -} diff --git a/lib/flash-rs/src/xsk/socket.rs b/lib/flash-rs/src/xsk/socket.rs index 7f9d2df..d332c02 100644 --- a/lib/flash-rs/src/xsk/socket.rs +++ b/lib/flash-rs/src/xsk/socket.rs @@ -1,116 +1,105 @@ use std::{io, sync::Arc, thread}; -use libc::{EAGAIN, EBUSY, ENETDOWN, ENOBUFS}; -use libxdp_sys::{ - XSK_RING_PROD__DEFAULT_NUM_DESCS, XSK_UMEM__DEFAULT_FRAME_SIZE, xsk_umem__add_offset_to_addr, - xsk_umem__extract_addr, -}; use quanta::{Clock, Instant}; use crate::{ config::{BindFlags, Mode}, - mem::Umem, - util, -}; - -use super::{ - desc::Desc, - error::SocketError, - error::SocketResult, fd::Fd, - ring::{CompRing, Cons as 
_, FillRing, Prod as _, RxRing, TxRing}, - shared::SocketShared, + mem::{CompRing, Cons as _, Desc, FillRing, Prod as _, RxRing, TxRing, Umem}, }; +#[cfg(feature = "pool")] +use crate::mem::Pool; + #[cfg(feature = "stats")] -use super::stats::Stats; +use crate::stats::Stats; -const FRAME_SIZE: u64 = XSK_UMEM__DEFAULT_FRAME_SIZE as u64; +use super::{ + error::{SocketError, SocketResult}, + shared::SocketShared, +}; #[derive(Debug)] pub struct Socket { fd: Fd, - rx: RxRing, - tx: TxRing, + umem: Umem, fill: FillRing, comp: CompRing, + rx: RxRing, + tx: TxRing, + + #[cfg(feature = "pool")] + pool: Pool, + outstanding_tx: u32, clock: Clock, idle_timestamp: Option, - umem: Umem, + shared: Arc, + #[cfg(feature = "stats")] stats: Arc, - shared: Arc, } impl Socket { pub(crate) fn new( fd: Fd, umem: Umem, + idx: usize, + umem_scale: u32, + umem_offset: u64, #[cfg(feature = "stats")] stats: Stats, data: Arc, ) -> SocketResult { let off = fd.xdp_mmap_offsets()?; + let mut fill = FillRing::new(&fd, off.fr(), umem_scale)?; + let comp = CompRing::new(&fd, off.cr(), umem_scale)?; + let rx = RxRing::new(&fd, off.rx(), umem_scale)?; + let tx = TxRing::new(&fd, off.tx(), umem_scale)?; + + #[cfg(feature = "pool")] + fill.populate(umem_scale, idx as u64 + umem_offset)?; + + #[cfg(not(feature = "pool"))] + fill.populate(2 * umem_scale, idx as u64 + umem_offset)?; + Ok(Self { - rx: RxRing::new(&fd, off.rx(), umem.scale)?, - tx: TxRing::new(&fd, off.tx(), umem.scale)?, - comp: CompRing::new(&fd, off.cr(), umem.scale)?, - fill: FillRing::new(&fd, off.fr(), umem.scale)?, fd, - outstanding_tx: 0, - clock: Clock::new(), - idle_timestamp: None, umem, - #[cfg(feature = "stats")] - stats: Arc::new(stats), - shared: data, - }) - } + rx, + tx, + comp, + fill, - pub(crate) fn populate_fq(&mut self, idx: usize) -> SocketResult<()> { - let nr_frames = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2 * self.umem.scale; - let offset = idx as u64 + self.umem.offset; + #[cfg(feature = "pool")] + pool: 
Pool::new(umem_scale, idx as u64 + umem_offset), - let mut idx_fq = 0; - if self.fill.reserve(nr_frames, &mut idx_fq) != nr_frames { - return Err(SocketError::FqPopulate); - } + outstanding_tx: 0, - for i in 0..u64::from(nr_frames) { - if let Some(fill_addr) = self.fill.addr(idx_fq) { - *fill_addr = (offset + i) * FRAME_SIZE; - } + clock: Clock::new(), + idle_timestamp: None, - idx_fq += 1; - } + shared: data, - self.fill.submit(nr_frames); - Ok(()) + #[cfg(feature = "stats")] + stats: Arc::new(stats), + }) } #[allow(clippy::missing_errors_doc)] #[inline] pub fn poll(&mut self) -> io::Result { if self.shared.xsk_config.mode.contains(Mode::FLASH_POLL) { + #[cfg(feature = "stats")] + unsafe { + (*self.stats.app.get()).opt_polls += 1; + } self.fd.poll() } else { Ok(true) } } - #[inline] - fn kick_tx(&self) -> Result<(), ()> { - if self.fd.kick() >= 0 { - Ok(()) - } else { - match util::get_errno() { - ENOBUFS | EAGAIN | EBUSY | ENETDOWN => Ok(()), - _ => Err(()), - } - } - } - #[allow(clippy::similar_names)] #[inline] fn complete_tx_rx(&mut self) { @@ -128,42 +117,54 @@ impl Socket { unsafe { (*self.stats.app.get()).tx_copy_sendtos += 1; } - let _ = self.kick_tx(); + let _ = self.fd.kick(); } let num_outstanding = self.outstanding_tx.min(self.shared.xsk_config.batch_size); let mut idx_cq = 0; - let mut idx_fq = 0; let completed = self.comp.peek(num_outstanding, &mut idx_cq); if completed == 0 { return; } - while self.fill.reserve(completed, &mut idx_fq) != completed { - if self.shared.xsk_config.mode.contains(Mode::FLASH_BUSY_POLL) - || self.fill.needs_wakeup() - { - #[cfg(feature = "stats")] - unsafe { - (*self.stats.app.get()).fill_fail_polls += 1; - } - self.fd.wakeup(); + #[cfg(feature = "pool")] + for _ in 0..completed { + if let Some(comp_addr) = self.comp.addr(idx_cq) { + self.pool.put(*comp_addr); } + idx_cq += 1; } - for _ in 0..completed { - if let Some(fill_addr) = self.fill.addr(idx_fq) { - if let Some(comp_addr) = self.comp.addr(idx_cq) { - *fill_addr = 
*comp_addr; + #[cfg(not(feature = "pool"))] + { + let mut idx_fq = 0; + while self.fill.reserve(completed, &mut idx_fq) != completed { + if self.shared.xsk_config.mode.contains(Mode::FLASH_BUSY_POLL) + || self.fill.needs_wakeup() + { + #[cfg(feature = "stats")] + unsafe { + (*self.stats.app.get()).fill_fail_polls += 1; + } + self.fd.wakeup(); } } - idx_fq += 1; - idx_cq += 1; + for _ in 0..completed { + if let Some(fill_addr) = self.fill.addr(idx_fq) { + if let Some(comp_addr) = self.comp.addr(idx_cq) { + *fill_addr = *comp_addr; + } + } + + idx_fq += 1; + idx_cq += 1; + } + + self.fill.submit(completed); } - self.fill.submit(completed); self.comp.release(completed); self.outstanding_tx -= completed; } @@ -190,7 +191,6 @@ impl Socket { #[inline] fn reserve_tx(&mut self, num: u32) -> u32 { let mut idx_tx = 0; - if self.tx.reserve(num, &mut idx_tx) == num { return idx_tx; } @@ -204,7 +204,7 @@ impl Socket { unsafe { (*self.stats.app.get()).tx_wakeup_sendtos += 1; } - let _ = self.kick_tx(); + let _ = self.fd.kick(); } if self.tx.reserve(num, &mut idx_tx) == num { @@ -214,11 +214,53 @@ impl Socket { if let Some(poll_config) = &self.shared.poll_config { if self.outstanding_tx >= poll_config.bp_threshold { thread::sleep(poll_config.bp_timeout); + + #[cfg(feature = "stats")] + unsafe { + (*self.stats.app.get()).backpressure += 1; + } } } } } + #[cfg(feature = "pool")] + #[inline] + fn replenish_fq(&mut self, num: u32) { + let mut idx_fq = self.reserve_fq(num); + let mut allocated = 0; + + while allocated < num { + if let Some(addr) = self.pool.get() { + if let Some(fill_addr) = self.fill.addr(idx_fq) { + *fill_addr = addr; + allocated += 1; + } else { + self.pool.put(addr); + + #[cfg(feature = "tracing")] + tracing::warn!("xsk: failed to get fill descriptor"); + } + + idx_fq += 1; + } else { + self.complete_tx_rx(); + + if self.shared.xsk_config.mode.contains(Mode::FLASH_BUSY_POLL) + || self.tx.needs_wakeup() + { + #[cfg(feature = "stats")] + unsafe { + 
(*self.stats.app.get()).tx_wakeup_sendtos += 1; + } + let _ = self.fd.kick(); + } + } + } + + self.fill.submit(num); + } + #[allow(clippy::missing_errors_doc)] pub fn recv(&mut self) -> SocketResult> { self.complete_tx_rx(); @@ -257,28 +299,23 @@ impl Socket { } if let Some(poll_config) = &self.shared.poll_config { - if rcvd >= poll_config.idle_threshold { + if rcvd >= poll_config.idle_threshold || self.outstanding_tx > 0 { self.idle_timestamp = None; } } + #[cfg(feature = "tracing")] if rcvd > self.shared.xsk_config.batch_size { - return Err(SocketError::BatchOverflow); + tracing::warn!("xsk: received more descriptors than batch size"); } let descs = (0..rcvd) - .filter_map(|_| { - let desc = self.rx.desc(idx_rx); - idx_rx += 1; - - desc.map(|desc| Desc { - addr: unsafe { xsk_umem__add_offset_to_addr(desc.addr) }, - len: desc.len, - options: desc.options, - }) - }) + .filter_map(|i| self.rx.desc(idx_rx + i).map(Into::into)) .collect(); + #[cfg(feature = "pool")] + self.replenish_fq(rcvd); + self.rx.release(rcvd); #[cfg(feature = "stats")] @@ -289,29 +326,60 @@ impl Socket { Ok(descs) } + #[cfg(feature = "pool")] + #[allow(clippy::missing_errors_doc)] + pub fn alloc(&mut self, num: usize) -> SocketResult> { + if num == 0 { + return Ok(vec![]); + } + + let mut descs = Vec::with_capacity(num); + while descs.len() < num { + if let Some(addr) = self.pool.get() { + descs.push(addr.into()); + } else { + self.complete_tx_rx(); + + if self.shared.xsk_config.mode.contains(Mode::FLASH_BUSY_POLL) + || self.tx.needs_wakeup() + { + #[cfg(feature = "stats")] + unsafe { + (*self.stats.app.get()).tx_wakeup_sendtos += 1; + } + let _ = self.fd.kick(); + } + } + } + + Ok(descs) + } + #[allow(clippy::cast_possible_truncation)] pub fn send(&mut self, descs: Vec) { - if descs.is_empty() { + let n = descs.len() as u32; + if n == 0 { return; } - let n = descs.len() as u32; let mut idx_tx = self.reserve_tx(n); - for desc in descs { - let tx_desc = self.tx.desc(idx_tx); - idx_tx += 1; - - 
if let Some(tx_desc) = tx_desc { - tx_desc.addr = desc.addr; - tx_desc.len = desc.len; - tx_desc.options = desc.options & 0xFFFF_0000; + if let Some(tx_desc) = self.tx.desc(idx_tx) { + desc.copy_to(tx_desc); + } else { + #[cfg(feature = "tracing")] + tracing::warn!("xsk: failed to get tx descriptor"); } + + idx_tx += 1; } self.tx.submit(n); self.outstanding_tx += n; + #[cfg(feature = "pool")] + self.complete_tx_rx(); + #[cfg(feature = "stats")] unsafe { (*self.stats.ring.get()).tx += u64::from(n); @@ -320,19 +388,27 @@ impl Socket { #[allow(clippy::cast_possible_truncation)] pub fn drop(&mut self, descs: Vec) { - if descs.is_empty() { + let n = descs.len() as u32; + if n == 0 { return; } - let n = descs.len() as u32; - let mut idx_fq = self.reserve_fq(n); + #[cfg(feature = "pool")] + self.pool.extend(descs.into_iter().map(Desc::extract_addr)); - for desc in descs { - if let Some(fill_addr) = self.fill.addr(idx_fq) { - *fill_addr = unsafe { xsk_umem__extract_addr(desc.addr) }; - } + #[cfg(not(feature = "pool"))] + { + let mut idx_fq = self.reserve_fq(n); + for desc in descs { + if let Some(fill_addr) = self.fill.addr(idx_fq) { + *fill_addr = desc.extract_addr(); + } else { + #[cfg(feature = "tracing")] + tracing::warn!("xsk: failed to get fill descriptor"); + } - idx_fq += 1; + idx_fq += 1; + } } self.fill.submit(n); @@ -346,17 +422,17 @@ impl Socket { #[allow(clippy::missing_errors_doc)] #[inline] pub fn read(&mut self, desc: &Desc) -> SocketResult<&mut [u8]> { - Ok(self.umem.get_data(desc.addr, desc.len as usize)?) + Ok(self.umem.get_data(desc)?) 
} #[allow(clippy::missing_errors_doc)] #[inline] pub fn read_exact(&mut self, desc: &Desc) -> SocketResult<&mut [u8; SIZE]> { - if SIZE > desc.len as usize { + if SIZE > desc.len() { Err(SocketError::SizeOverflow) } else { unsafe { - let data = self.umem.get_data(desc.addr, desc.len as usize)?; + let data = self.umem.get_data(desc)?; Ok(&mut *data.as_mut_ptr().cast::<[u8; SIZE]>()) } } From 9d6ebed3a68dc2cf7602f12583dd34631f2ba743 Mon Sep 17 00:00:00 2001 From: Arghyadip Chakraborty Date: Fri, 11 Jul 2025 03:33:49 +0800 Subject: [PATCH 16/43] feat(rust): added arpresolver-rs --- Cargo.toml | 1 + examples/arpresolver-rs/Cargo.toml | 20 ++++++ examples/arpresolver-rs/build.rs | 3 + examples/arpresolver-rs/src/cli.rs | 28 ++++++++ examples/arpresolver-rs/src/main.rs | 103 ++++++++++++++++++++++++++++ examples/arpresolver-rs/src/nf.rs | 47 +++++++++++++ 6 files changed, 202 insertions(+) create mode 100644 examples/arpresolver-rs/Cargo.toml create mode 100644 examples/arpresolver-rs/build.rs create mode 100644 examples/arpresolver-rs/src/cli.rs create mode 100644 examples/arpresolver-rs/src/main.rs create mode 100644 examples/arpresolver-rs/src/nf.rs diff --git a/Cargo.toml b/Cargo.toml index 50c0d69..1035a6f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "lib/flash-rs", + "examples/arpresolver-rs", "examples/firewall-rs", "examples/helloworld-rs", "examples/ip4ping-rs", diff --git a/examples/arpresolver-rs/Cargo.toml b/examples/arpresolver-rs/Cargo.toml new file mode 100644 index 0000000..52ee173 --- /dev/null +++ b/examples/arpresolver-rs/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "arpresolver" +version = "0.1.0" +edition = "2024" + +[dependencies] +clap = { version = "4.5.35", features = ["derive"] } +core_affinity = "0.8.3" +ctrlc = "3.4.5" +flash = { path = "../../lib/flash-rs", features = ["clap"] } +macaddr = "1.0.1" +tracing = { version = "0.1.41", optional = true } +tracing-subscriber = { version = "0.3.19", optional = 
true } + +[features] +default = [] +tracing = ["dep:tracing", "dep:tracing-subscriber", "flash/tracing"] + +[lints.rust] +unsafe_code = "forbid" diff --git a/examples/arpresolver-rs/build.rs b/examples/arpresolver-rs/build.rs new file mode 100644 index 0000000..6754864 --- /dev/null +++ b/examples/arpresolver-rs/build.rs @@ -0,0 +1,3 @@ +fn main() { + println!("cargo:rustc-link-lib=xdp"); +} diff --git a/examples/arpresolver-rs/src/cli.rs b/examples/arpresolver-rs/src/cli.rs new file mode 100644 index 0000000..c8dbe04 --- /dev/null +++ b/examples/arpresolver-rs/src/cli.rs @@ -0,0 +1,28 @@ +use clap::Parser; +use flash::FlashConfig; +use macaddr::MacAddr6; + +#[derive(Debug, Parser)] +pub struct Cli { + #[command(flatten)] + pub flash_config: FlashConfig, + + #[arg( + short = 'c', + long, + default_value_t = 0, + help = "Starting CPU core index for socket threads" + )] + pub cpu_start: usize, + + #[arg( + short = 'e', + long, + default_value_t = 0, + help = "Ending CPU core index for socket threads (inclusive)" + )] + pub cpu_end: usize, + + #[arg(short = 'm', long, help = "Interface MAC address")] + pub mac_addr: MacAddr6, +} diff --git a/examples/arpresolver-rs/src/main.rs b/examples/arpresolver-rs/src/main.rs new file mode 100644 index 0000000..cb6ee75 --- /dev/null +++ b/examples/arpresolver-rs/src/main.rs @@ -0,0 +1,103 @@ +mod cli; +mod nf; + +use std::{ + net::Ipv4Addr, + sync::{ + Arc, + atomic::{AtomicBool, Ordering}, + }, + thread, +}; + +use clap::Parser; +use flash::Socket; +use macaddr::MacAddr6; + +use crate::cli::Cli; + +fn socket_thread(mut socket: Socket, mac_addr: MacAddr6, ip_addr: Ipv4Addr, run: &Arc) { + while run.load(Ordering::SeqCst) { + if !socket.poll().is_ok_and(|val| val) { + continue; + } + + let Ok(descs) = socket.recv() else { + continue; + }; + + let (descs_send, descs_drop) = descs.into_iter().partition(|desc| { + socket + .read_exact(desc) + .is_ok_and(|pkt| nf::arp_resolve(pkt, mac_addr, ip_addr)) + }); + + socket.send(descs_send); 
+ socket.drop(descs_drop); + } +} + +fn main() { + #[cfg(feature = "tracing")] + tracing_subscriber::fmt::init(); + + let cli = Cli::parse(); + + let (sockets, route) = match flash::connect(&cli.flash_config) { + Ok(t) => t, + Err(err) => { + eprintln!("{err}"); + return; + } + }; + + if sockets.is_empty() { + eprintln!("no sockets received"); + return; + } + + #[cfg(feature = "tracing")] + tracing::debug!("Sockets: {:?}", sockets); + + let cores = core_affinity::get_core_ids() + .unwrap_or_default() + .into_iter() + .filter(|core_id| core_id.id >= cli.cpu_start && core_id.id <= cli.cpu_end) + .collect::>(); + + if cores.is_empty() { + eprintln!("No cores found in range {}-{}", cli.cpu_start, cli.cpu_end); + return; + } + + #[cfg(feature = "tracing")] + tracing::debug!("Cores: {:?}", cores); + + let run = Arc::new(AtomicBool::new(true)); + + let r = run.clone(); + if let Err(err) = ctrlc::set_handler(move || { + r.store(false, Ordering::SeqCst); + }) { + eprintln!("error setting Ctrl-C handler: {err}"); + return; + } + + let handles = sockets + .into_iter() + .zip(cores.into_iter().cycle()) + .map(|(socket, core_id)| { + let r = run.clone(); + thread::spawn(move || { + core_affinity::set_for_current(core_id); + socket_thread(socket, cli.mac_addr, route.ip_addr, &r); + }) + }) + .collect::>(); + + for handle in handles { + if let Err(err) = handle.join() { + eprintln!("error in thread: {err:?}"); + } + } +} diff --git a/examples/arpresolver-rs/src/nf.rs b/examples/arpresolver-rs/src/nf.rs new file mode 100644 index 0000000..15e2c95 --- /dev/null +++ b/examples/arpresolver-rs/src/nf.rs @@ -0,0 +1,47 @@ +use std::net::Ipv4Addr; + +use macaddr::MacAddr6; + +const ETHER_TYPE_ARP: u16 = 0x0806; + +const ARP_HTYPE_ETHERNET: u16 = 0x0001; +const ARP_PTYPE_IPV4: u16 = 0x0800; +const ARP_HLEN_ETHERNET: u8 = 6; +const ARP_PLEN_IPV4: u8 = 4; +const ARP_OPCODE_REQUEST: u16 = 1; +const ARP_OPCODE_REPLY: u16 = 2; + +#[forbid(clippy::indexing_slicing)] +#[inline] +pub fn 
arp_resolve(pkt: &mut [u8; 42], mac_addr: MacAddr6, ip_addr: Ipv4Addr) -> bool { + if u16::from_be_bytes([pkt[12], pkt[13]]) != ETHER_TYPE_ARP + || u16::from_be_bytes([pkt[14], pkt[15]]) != ARP_HTYPE_ETHERNET + || u16::from_be_bytes([pkt[16], pkt[17]]) != ARP_PTYPE_IPV4 + || pkt[18] != ARP_HLEN_ETHERNET + || pkt[19] != ARP_PLEN_IPV4 + || u16::from_be_bytes([pkt[20], pkt[21]]) != ARP_OPCODE_REQUEST + { + return false; + } + + if pkt[38..42] != ip_addr.octets() { + return false; + } + + let mut tmp = [0u8; 6]; + + tmp.copy_from_slice(&pkt[6..12]); + pkt[0..6].copy_from_slice(&tmp); + pkt[32..38].copy_from_slice(&tmp); + + pkt[6..12].copy_from_slice(&mac_addr.into_array()); + pkt[22..28].copy_from_slice(&mac_addr.into_array()); + + pkt[20..22].copy_from_slice(&ARP_OPCODE_REPLY.to_be_bytes()); + + tmp[0..4].copy_from_slice(&pkt[28..32]); + pkt[38..42].swap_with_slice(&mut tmp[0..4]); + pkt[28..32].copy_from_slice(&tmp[0..4]); + + true +} From 4dd07e6da4e02d6eccb5f726dfce0f1b6e4ce4f6 Mon Sep 17 00:00:00 2001 From: Sameer Ahmad Date: Sat, 30 Aug 2025 02:13:07 +0530 Subject: [PATCH 17/43] fix: compilation errors on arm64 --- examples/mica/ported-mica/table.h | 395 +++++++++++--------------- examples/unit-tests/userspace-chain.c | 7 + 2 files changed, 174 insertions(+), 228 deletions(-) diff --git a/examples/mica/ported-mica/table.h b/examples/mica/ported-mica/table.h index dcb49c6..a54a48f 100644 --- a/examples/mica/ported-mica/table.h +++ b/examples/mica/ported-mica/table.h @@ -19,14 +19,12 @@ #include "alloc_malloc.h" #include "alloc_dynamic.h" #include "shm.h" -#include MEHCACHED_BEGIN #define MEHCACHED_MAX_KEY_LENGTH (255) #define MEHCACHED_MAX_VALUE_LENGTH (1048575) - #ifndef MEHCACHED_NO_EVICTION // #define MEHCACHED_ITEMS_PER_BUCKET (7) #define MEHCACHED_ITEMS_PER_BUCKET (15) @@ -43,318 +41,259 @@ MEHCACHED_BEGIN #define MEHCACHED_SINGLE_ALLOC #ifdef MEHCACHED_COLLECT_STATS -#define MEHCACHED_STAT_INC(table, name) do { __sync_add_and_fetch(&(table)->stats.name, 1); 
} while (0) -#define MEHCACHED_STAT_DEC(table, name) do { __sync_sub_and_fetch(&(table)->stats.name, 1); } while (0) +#define MEHCACHED_STAT_INC(table, name) \ + do { \ + __sync_add_and_fetch(&(table)->stats.name, 1); \ + } while (0) +#define MEHCACHED_STAT_DEC(table, name) \ + do { \ + __sync_sub_and_fetch(&(table)->stats.name, 1); \ + } while (0) #else -#define MEHCACHED_STAT_INC(table, name) do { (void)table; } while (0) -#define MEHCACHED_STAT_DEC(table, name) do { (void)table; } while (0) +#define MEHCACHED_STAT_INC(table, name) \ + do { \ + (void)table; \ + } while (0) +#define MEHCACHED_STAT_DEC(table, name) \ + do { \ + (void)table; \ + } while (0) #endif -typedef enum _MEHCACHED_RESULT -{ - MEHCACHED_OK = 0, - MEHCACHED_ERROR, - MEHCACHED_FULL, - MEHCACHED_EXIST, - MEHCACHED_NOT_FOUND, - MEHCACHED_PARTIAL_VALUE, - MEHCACHED_NOT_PROCESSED, +typedef enum _MEHCACHED_RESULT { + MEHCACHED_OK = 0, + MEHCACHED_ERROR, + MEHCACHED_FULL, + MEHCACHED_EXIST, + MEHCACHED_NOT_FOUND, + MEHCACHED_PARTIAL_VALUE, + MEHCACHED_NOT_PROCESSED, } MEHCACHED_RESULT; -struct mehcached_bucket -{ - uint32_t version; // XXX: is uint32_t wide enough? - uint32_t next_extra_bucket_index; // 1-base; 0 = no extra bucket - uint64_t item_vec[MEHCACHED_ITEMS_PER_BUCKET]; +struct mehcached_bucket { + uint32_t version; // XXX: is uint32_t wide enough? 
+ uint32_t next_extra_bucket_index; // 1-base; 0 = no extra bucket + uint64_t item_vec[MEHCACHED_ITEMS_PER_BUCKET]; - // 16: tag (1-base) - // 8: alloc id - // 40: item offset - // item == 0: empty item + // 16: tag (1-base) + // 8: alloc id + // 40: item offset + // item == 0: empty item - #define MEHCACHED_TAG_MASK (((uint64_t)1 << 16) - 1) - #define MEHCACHED_TAG(item_vec) ((item_vec) >> 48) +#define MEHCACHED_TAG_MASK (((uint64_t)1 << 16) - 1) +#define MEHCACHED_TAG(item_vec) ((item_vec) >> 48) #ifndef MEHCACHED_SINGLE_ALLOC - #define MEHCACHED_ALLOC_ID_MASK (((uint64_t)1 << 8) - 1) - #define MEHCACHED_ALLOC_ID(item_vec) (((item_vec) >> 40) & MEHCACHED_ALLOC_ID_MASK) +#define MEHCACHED_ALLOC_ID_MASK (((uint64_t)1 << 8) - 1) +#define MEHCACHED_ALLOC_ID(item_vec) (((item_vec) >> 40) & MEHCACHED_ALLOC_ID_MASK) #else - #define MEHCACHED_ALLOC_ID(item_vec) (0LU) +#define MEHCACHED_ALLOC_ID(item_vec) (0LU) #endif - - #define MEHCACHED_ITEM_OFFSET(item_vec) ((item_vec) & MEHCACHED_ITEM_OFFSET_MASK) +#define MEHCACHED_ITEM_OFFSET(item_vec) ((item_vec) & MEHCACHED_ITEM_OFFSET_MASK) #ifndef MEHCACHED_SINGLE_ALLOC - #define MEHCACHED_ITEM_VEC(tag, alloc_id, item_offset) (((uint64_t)(tag) << 48) | ((uint64_t)(alloc_id) << 40) | (uint64_t)(item_offset)) +#define MEHCACHED_ITEM_VEC(tag, alloc_id, item_offset) \ + (((uint64_t)(tag) << 48) | ((uint64_t)(alloc_id) << 40) | (uint64_t)(item_offset)) #else - #define MEHCACHED_ITEM_VEC(tag, alloc_id, item_offset) (((uint64_t)(tag) << 48) | (uint64_t)(item_offset)) +#define MEHCACHED_ITEM_VEC(tag, alloc_id, item_offset) (((uint64_t)(tag) << 48) | (uint64_t)(item_offset)) #endif }; -struct mehcached_item -{ - struct mehcached_alloc_item alloc_item; +struct mehcached_item { + struct mehcached_alloc_item alloc_item; - uint32_t kv_length_vec; // key_length: 8, value_length: 24; kv_length_vec == 0: empty item + uint32_t kv_length_vec; // key_length: 8, value_length: 24; kv_length_vec == 0: empty item - #define MEHCACHED_KEY_MASK 
(((uint32_t)1 << 8) - 1) - #define MEHCACHED_KEY_LENGTH(kv_length_vec) ((kv_length_vec) >> 24) +#define MEHCACHED_KEY_MASK (((uint32_t)1 << 8) - 1) +#define MEHCACHED_KEY_LENGTH(kv_length_vec) ((kv_length_vec) >> 24) - #define MEHCACHED_VALUE_MASK (((uint32_t)1 << 24) - 1) - #define MEHCACHED_VALUE_LENGTH(kv_length_vec) ((kv_length_vec) & MEHCACHED_VALUE_MASK) +#define MEHCACHED_VALUE_MASK (((uint32_t)1 << 24) - 1) +#define MEHCACHED_VALUE_LENGTH(kv_length_vec) ((kv_length_vec) & MEHCACHED_VALUE_MASK) - #define MEHCACHED_KV_LENGTH_VEC(key_length, value_length) (((uint32_t)(key_length) << 24) | (uint32_t)(value_length)) +#define MEHCACHED_KV_LENGTH_VEC(key_length, value_length) (((uint32_t)(key_length) << 24) | (uint32_t)(value_length)) - // the rest is meaningful only when kv_length_vec != 0 - uint32_t expire_time; - uint64_t key_hash; - uint8_t data[0]; + // the rest is meaningful only when kv_length_vec != 0 + uint32_t expire_time; + uint64_t key_hash; + uint8_t data[0]; }; #define MEHCACHED_MAX_POOLS (16) -struct mehcached_table -{ +struct mehcached_table { #ifdef MEHCACHED_ALLOC_POOL - struct mehcached_pool alloc[MEHCACHED_MAX_POOLS]; - uint8_t alloc_id_mask; - uint64_t mth_threshold; + struct mehcached_pool alloc[MEHCACHED_MAX_POOLS]; + uint8_t alloc_id_mask; + uint64_t mth_threshold; #endif #ifdef MEHCACHED_ALLOC_MALLOC - struct mehcached_malloc alloc; + struct mehcached_malloc alloc; #endif #ifdef MEHCACHED_ALLOC_DYNAMIC - struct mehcached_dynamic alloc; + struct mehcached_dynamic alloc; #endif - struct mehcached_bucket *buckets; - struct mehcached_bucket *extra_buckets; // = (buckets + num_buckets); extra_buckets[0] is not used because index 0 indicates "no more extra bucket" + struct mehcached_bucket *buckets; + struct mehcached_bucket * + extra_buckets; // = (buckets + num_buckets); extra_buckets[0] is not used because index 0 indicates "no more extra bucket" - uint8_t concurrent_access_mode; + uint8_t concurrent_access_mode; - uint32_t num_buckets; - 
uint32_t num_buckets_mask; - uint32_t num_extra_buckets; + uint32_t num_buckets; + uint32_t num_buckets_mask; + uint32_t num_extra_buckets; - struct - { - uint32_t lock; - uint32_t head; // 1-base; 0 = no extra bucket - } extra_bucket_free_list MEHCACHED_ALIGNED(64); + struct { + uint32_t lock; + uint32_t head; // 1-base; 0 = no extra bucket + } extra_bucket_free_list MEHCACHED_ALIGNED(64); - uint8_t rshift; + uint8_t rshift; #ifdef MEHCACHED_COLLECT_STATS - struct - { - size_t count; - size_t set_nooverwrite; - size_t set_new; - size_t set_inplace; - size_t set_evicted; - size_t get_found; - size_t get_notfound; - size_t test_found; - size_t test_notfound; - size_t delete_found; - size_t delete_notfound; - size_t cleanup; - size_t move_to_head_performed; - size_t move_to_head_skipped; - size_t move_to_head_failed; - } stats; + struct { + size_t count; + size_t set_nooverwrite; + size_t set_new; + size_t set_inplace; + size_t set_evicted; + size_t get_found; + size_t get_notfound; + size_t test_found; + size_t test_notfound; + size_t delete_found; + size_t delete_notfound; + size_t cleanup; + size_t move_to_head_performed; + size_t move_to_head_skipped; + size_t move_to_head_failed; + } stats; #endif } MEHCACHED_ALIGNED(64); -struct mehcached_prefetch_state -{ - struct mehcached_table *table; - struct mehcached_bucket *bucket; - uint64_t key_hash; +struct mehcached_prefetch_state { + struct mehcached_table *table; + struct mehcached_bucket *bucket; + uint64_t key_hash; }; -typedef enum _MEHCACHED_OPERATION -{ - MEHCACHED_NOOP_READ = 0, - MEHCACHED_NOOP_WRITE, - MEHCACHED_ADD, - MEHCACHED_SET, - MEHCACHED_GET, - MEHCACHED_TEST, - MEHCACHED_DELETE, - MEHCACHED_INCREMENT, +typedef enum _MEHCACHED_OPERATION { + MEHCACHED_NOOP_READ = 0, + MEHCACHED_NOOP_WRITE, + MEHCACHED_ADD, + MEHCACHED_SET, + MEHCACHED_GET, + MEHCACHED_TEST, + MEHCACHED_DELETE, + MEHCACHED_INCREMENT, } MEHCACHED_OPERATION; -struct mehcached_request -{ - // 0 - uint8_t operation; // of enum 
MEHCACHED_OPERATION type - uint8_t result; // of enum MEHCACHED_RESULT type - // 2 - uint16_t reserved0; - // 4 - uint32_t kv_length_vec; - // 8 - uint64_t key_hash; - // 16 - uint32_t expire_time; - // 20 - uint32_t reserved1; - // 24 +struct mehcached_request { + // 0 + uint8_t operation; // of enum MEHCACHED_OPERATION type + uint8_t result; // of enum MEHCACHED_RESULT type + // 2 + uint16_t reserved0; + // 4 + uint32_t kv_length_vec; + // 8 + uint64_t key_hash; + // 16 + uint32_t expire_time; + // 20 + uint32_t reserved1; + // 24 }; +void mehcached_print_bucket(const struct mehcached_bucket *bucket); -void -mehcached_print_bucket(const struct mehcached_bucket *bucket); - - -void -mehcached_print_buckets(const struct mehcached_table *table); - - -void -mehcached_print_stats(const struct mehcached_table *table); - - -void -mehcached_reset_table_stats(struct mehcached_table *table); - - -uint32_t -mehcached_calc_bucket_index(const struct mehcached_table *table, uint64_t key_hash); - - -uint16_t -mehcached_calc_tag(uint64_t key_hash); - - -void -mehcached_set_item(struct mehcached_item *item, uint64_t key_hash, const uint8_t *key, uint32_t key_length, const uint8_t *value, uint32_t value_length, uint32_t expire_time); - - -void -mehcached_set_item_value(struct mehcached_item *item, const uint8_t *value, uint32_t value_length, uint32_t expire_time); - - -bool -mehcached_compare_keys(const uint8_t *key1, size_t key1_len, const uint8_t *key2, size_t key2_len); - - -void -mehcached_cleanup_all(uint8_t current_alloc_id, struct mehcached_table *table); +void mehcached_print_buckets(const struct mehcached_table *table); +void mehcached_print_stats(const struct mehcached_table *table); -void -mehcached_prefetch_table(struct mehcached_table *table, uint64_t key_hash, struct mehcached_prefetch_state *out_prefetch_state); +void mehcached_reset_table_stats(struct mehcached_table *table); +uint32_t mehcached_calc_bucket_index(const struct mehcached_table *table, uint64_t 
key_hash); -void -mehcached_prefetch_alloc(struct mehcached_prefetch_state *in_out_prefetch_state); +uint16_t mehcached_calc_tag(uint64_t key_hash); +void mehcached_set_item(struct mehcached_item *item, uint64_t key_hash, const uint8_t *key, uint32_t key_length, const uint8_t *value, + uint32_t value_length, uint32_t expire_time); -bool -mehcached_get(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, size_t key_length, uint8_t *out_value, size_t *in_out_value_length, uint32_t *out_expire_time, bool readonly); +void mehcached_set_item_value(struct mehcached_item *item, const uint8_t *value, uint32_t value_length, uint32_t expire_time); +bool mehcached_compare_keys(const uint8_t *key1, size_t key1_len, const uint8_t *key2, size_t key2_len); -bool -mehcached_test(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, size_t key_length); +void mehcached_cleanup_all(uint8_t current_alloc_id, struct mehcached_table *table); -bool -mehcached_set(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, size_t key_length, const uint8_t *value, size_t value_length, uint32_t expire_time, bool overwrite); +void mehcached_prefetch_table(struct mehcached_table *table, uint64_t key_hash, struct mehcached_prefetch_state *out_prefetch_state); +void mehcached_prefetch_alloc(struct mehcached_prefetch_state *in_out_prefetch_state); -bool -mehcached_delete(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, size_t key_length); +bool mehcached_get(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, size_t key_length, + uint8_t *out_value, size_t *in_out_value_length, uint32_t *out_expire_time, bool readonly); +bool mehcached_test(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, size_t key_length); -bool 
-mehcached_increment(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, size_t key_length, uint64_t increment, uint64_t *out_new_value, uint32_t expire_time); +bool mehcached_set(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, size_t key_length, + const uint8_t *value, size_t value_length, uint32_t expire_time, bool overwrite); +bool mehcached_delete(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, + size_t key_length); -void -mehcached_process_batch(uint8_t current_alloc_id, struct mehcached_table *table, struct mehcached_request *requests, size_t num_requests, const uint8_t *in_data, uint8_t *out_data, size_t *out_data_length, bool readonly); +bool mehcached_increment(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t key_hash, const uint8_t *key, + size_t key_length, uint64_t increment, uint64_t *out_new_value, uint32_t expire_time); +void mehcached_process_batch(uint8_t current_alloc_id, struct mehcached_table *table, struct mehcached_request *requests, + size_t num_requests, const uint8_t *in_data, uint8_t *out_data, size_t *out_data_length, bool readonly); -void -mehcached_table_reset(struct mehcached_table *table); +void mehcached_table_reset(struct mehcached_table *table); +void mehcached_table_init(struct mehcached_table *table, size_t num_buckets, size_t num_pools, size_t pool_size, + bool concurrent_table_read, bool concurrent_table_write, bool concurrent_alloc_write, size_t table_numa_node, + size_t alloc_numa_nodes[], double mth_threshold); -void -mehcached_table_init(struct mehcached_table *table, size_t num_buckets, size_t num_pools, size_t pool_size, bool concurrent_table_read, bool concurrent_table_write, bool concurrent_alloc_write, size_t table_numa_node, size_t alloc_numa_nodes[], double mth_threshold); - - -void -mehcached_table_free(struct mehcached_table *table); - - -uint32_t 
-mehcached_read_version_begin(const struct mehcached_table *table MEHCACHED_UNUSED, const struct mehcached_bucket *bucket MEHCACHED_UNUSED); +void mehcached_table_free(struct mehcached_table *table); +uint32_t mehcached_read_version_begin(const struct mehcached_table *table MEHCACHED_UNUSED, + const struct mehcached_bucket *bucket MEHCACHED_UNUSED); //uint64_t -uint32_t -mehcached_read_version_end(const struct mehcached_table *table MEHCACHED_UNUSED, const struct mehcached_bucket *bucket MEHCACHED_UNUSED); - - -void -mehcached_lock_bucket(const struct mehcached_table *table MEHCACHED_UNUSED, struct mehcached_bucket *bucket MEHCACHED_UNUSED); - - -void -mehcached_unlock_bucket(const struct mehcached_table *table MEHCACHED_UNUSED, struct mehcached_bucket *bucket MEHCACHED_UNUSED); - - -void -mehcached_lock_extra_bucket_free_list(struct mehcached_table *table); - - -void -mehcached_unlock_extra_bucket_free_list(struct mehcached_table *table); - - -bool -mehcached_has_extra_bucket(struct mehcached_bucket *bucket MEHCACHED_UNUSED); - - -struct mehcached_bucket * -mehcached_extra_bucket(const struct mehcached_table *table, uint32_t extra_bucket_index); - - -bool -mehcached_alloc_extra_bucket(struct mehcached_table *table, struct mehcached_bucket *bucket); - - -void -mehcached_free_extra_bucket(struct mehcached_table *table, struct mehcached_bucket *bucket); +uint32_t mehcached_read_version_end(const struct mehcached_table *table MEHCACHED_UNUSED, + const struct mehcached_bucket *bucket MEHCACHED_UNUSED); +void mehcached_lock_bucket(const struct mehcached_table *table MEHCACHED_UNUSED, struct mehcached_bucket *bucket MEHCACHED_UNUSED); -void -mehcached_fill_hole(struct mehcached_table *table, struct mehcached_bucket *bucket, size_t unused_item_index); +void mehcached_unlock_bucket(const struct mehcached_table *table MEHCACHED_UNUSED, struct mehcached_bucket *bucket MEHCACHED_UNUSED); +void mehcached_lock_extra_bucket_free_list(struct mehcached_table *table); -size_t 
-mehcached_find_empty(struct mehcached_table *table, struct mehcached_bucket *bucket, struct mehcached_bucket **located_bucket); +void mehcached_unlock_extra_bucket_free_list(struct mehcached_table *table); +bool mehcached_has_extra_bucket(struct mehcached_bucket *bucket MEHCACHED_UNUSED); -size_t -mehcached_find_empty_or_oldest(const struct mehcached_table *table, struct mehcached_bucket *bucket, struct mehcached_bucket **located_bucket); +struct mehcached_bucket *mehcached_extra_bucket(const struct mehcached_table *table, uint32_t extra_bucket_index); +bool mehcached_alloc_extra_bucket(struct mehcached_table *table, struct mehcached_bucket *bucket); -size_t -mehcached_find_item_index(const struct mehcached_table *table, struct mehcached_bucket *bucket, uint64_t key_hash, uint16_t tag, const uint8_t *key, size_t key_length, struct mehcached_bucket **located_bucket); +void mehcached_free_extra_bucket(struct mehcached_table *table, struct mehcached_bucket *bucket); +void mehcached_fill_hole(struct mehcached_table *table, struct mehcached_bucket *bucket, size_t unused_item_index); -size_t -mehcached_find_same_tag(const struct mehcached_table *table, struct mehcached_bucket *bucket, uint16_t tag, struct mehcached_bucket **located_bucket); +size_t mehcached_find_empty(struct mehcached_table *table, struct mehcached_bucket *bucket, struct mehcached_bucket **located_bucket); +size_t mehcached_find_empty_or_oldest(const struct mehcached_table *table, struct mehcached_bucket *bucket, + struct mehcached_bucket **located_bucket); -void -mehcached_cleanup_bucket(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t old_tail, uint64_t new_tail); +size_t mehcached_find_item_index(const struct mehcached_table *table, struct mehcached_bucket *bucket, uint64_t key_hash, uint16_t tag, + const uint8_t *key, size_t key_length, struct mehcached_bucket **located_bucket); +size_t mehcached_find_same_tag(const struct mehcached_table *table, struct mehcached_bucket *bucket, 
uint16_t tag, + struct mehcached_bucket **located_bucket); -void -mehcached_table_free(struct mehcached_table *table); +void mehcached_cleanup_bucket(uint8_t current_alloc_id, struct mehcached_table *table, uint64_t old_tail, uint64_t new_tail); +void mehcached_table_free(struct mehcached_table *table); MEHCACHED_END \ No newline at end of file diff --git a/examples/unit-tests/userspace-chain.c b/examples/unit-tests/userspace-chain.c index 317ed65..75b7813 100644 --- a/examples/unit-tests/userspace-chain.c +++ b/examples/unit-tests/userspace-chain.c @@ -76,10 +76,17 @@ struct guest_queue *guest_queues[FLASH_MAX_SOCKETS][FLASH_MAX_SOCKETS]; ///////////// guest ring buffer operations ///////////// +#if defined(__ARM_ARCH_ISA_A64) +#define guest_cpu_relax() \ + do { \ + asm volatile("yield\n" : : : "memory"); \ + } while (0) +#elif defined(__x86_64__) #define guest_cpu_relax() \ do { \ asm volatile("pause\n" : : : "memory"); \ } while (0) +#endif static inline __u32 guest_move_prod_head(struct guest_queue *r, __u32 n, __u32 *old_head, __u32 *new_head) { From ba92c5e1bbebb3c3ce7e71b10cb00831b4f0cc92 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Mon, 1 Sep 2025 18:26:35 +0530 Subject: [PATCH 18/43] fix: static library build fail --- examples/arpresolver/main.c | 9 ++-- examples/firewall/main.c | 15 +++---- examples/ip4ping/main.c | 9 ++-- examples/l2fwd/main.c | 3 +- examples/simplefwd/main.c | 3 +- examples/txgen/main.c | 3 +- examples/unit-tests/correctness.c | 70 +++++++++++++------------------ examples/unit-tests/fwddrop.c | 3 +- examples/unit-tests/fwdrr.c | 3 +- lib/flash/nf/flash_nf.c | 8 ++-- lib/flash/nf/flash_nf.h | 1 - lib/flash/nf/flash_stats.c | 2 +- lib/include/flash_defines.h | 2 +- 13 files changed, 63 insertions(+), 68 deletions(-) diff --git a/examples/arpresolver/main.c b/examples/arpresolver/main.c index e6bf2ea..71e4e49 100644 --- a/examples/arpresolver/main.c +++ b/examples/arpresolver/main.c @@ -28,7 +28,7 @@ #define PROTO_STRLEN 4 #define 
IFNAME_STRLEN 256 -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf; @@ -178,7 +178,7 @@ static void *socket_routine(void *arg) struct sock_args *a = (struct sock_args *)arg; int socket_id = a->socket_id; - + log_info("SOCKET_ID: %d", socket_id); xsk = nf->thread[socket_id]->socket; @@ -302,6 +302,7 @@ int main(int argc, char **argv) cfg->app_name = "ARP Resolver"; cfg->app_options = arpresolver_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) @@ -320,7 +321,7 @@ int main(int argc, char **argv) log_error("ERROR: Unable to get MAC address for interface %s", cfg->ifname); goto out_cfg; } - + memcpy(src_mac, tmp_addr.ether_addr_octet, ETH_ALEN); // Parse JSON @@ -360,7 +361,7 @@ int main(int argc, char **argv) stats_cfg.nf = nf; stats_cfg.cfg = cfg; - if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)){ + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics thread"); goto out_args; } diff --git a/examples/firewall/main.c b/examples/firewall/main.c index 92c01ee..9e313e5 100644 --- a/examples/firewall/main.c +++ b/examples/firewall/main.c @@ -25,7 +25,7 @@ #define IFNAME_STRLEN 256 #define NUM_INVALID_SESSIONS 1000 -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf; @@ -64,7 +64,7 @@ static void *configure(void) { // Initialise invalid_sessions with random numbers srand(time(NULL)); // Seed only once before generating any random numbers - + for (int i = 0; i < NUM_INVALID_SESSIONS; i++) { int r = rand(); // Different value each iteration invalid_sessions[i] = r; @@ -100,7 +100,7 @@ static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int s case 's': app_conf->stats_cpu = atoi(optarg); break; - default: + default: printf("Usage: %s -h\n", argv[-shift]); return -1; } @@ -123,7 +123,7 @@ static void *socket_routine(void *arg) int socket_id = 
a->socket_id; - log_info("SOCKET_ID: %d", socket_id); + log_info("SOCKET_ID: %d", socket_id); xsk = nf->thread[socket_id]->socket; @@ -155,7 +155,7 @@ static void *socket_routine(void *arg) ret = flash__poll(cfg, xsk, fds, nfds); if (!(ret == 1 || ret == -2)) continue; - + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); wsend = 0; wdrop = 0; @@ -272,14 +272,15 @@ int main(int argc, char **argv) cfg->app_name = "Firewall Application"; cfg->app_options = firewall_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) goto out_cfg; - + if (parse_app_args(argc, argv, &app_conf, shift) < 0) goto out_cfg; - + if (flash__configure_nf(&nf, cfg) < 0) goto out_cfg; diff --git a/examples/ip4ping/main.c b/examples/ip4ping/main.c index f4a6c04..5dc876e 100644 --- a/examples/ip4ping/main.c +++ b/examples/ip4ping/main.c @@ -17,7 +17,7 @@ #include #include -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf; @@ -232,7 +232,7 @@ static void *socket_routine(void *arg) dropvecs[wdrop++] = xskvecs[i]; if (ntohs(eth->h_proto) != ETH_P_IP || len < (sizeof(*eth) + sizeof(*ip) + sizeof(*icmp)) || - ip->protocol != IPPROTO_ICMP || icmp->type != ICMP_ECHO) { + ip->protocol != IPPROTO_ICMP || icmp->type != ICMP_ECHO) { sendvecs[wsend++] = xskvecs[i]; if (app_conf.sriov) { @@ -291,14 +291,15 @@ int main(int argc, char **argv) cfg->app_name = "IP4 Ping Application"; cfg->app_options = ip4ping_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) goto out_cfg; - + if (parse_app_args(argc, argv, &app_conf, shift) < 0) goto out_cfg; - + if (flash__configure_nf(&nf, cfg) < 0) goto out_cfg; diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c index f3820d4..eb8bdf5 100644 --- a/examples/l2fwd/main.c +++ b/examples/l2fwd/main.c @@ -13,7 +13,7 @@ #include #include -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf = 
NULL; @@ -190,6 +190,7 @@ int main(int argc, char **argv) cfg->app_name = "L2 Forwarding Application"; cfg->app_options = l2fwd_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) diff --git a/examples/simplefwd/main.c b/examples/simplefwd/main.c index c8abc8b..ba72e68 100644 --- a/examples/simplefwd/main.c +++ b/examples/simplefwd/main.c @@ -11,7 +11,7 @@ #include #include -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf = NULL; @@ -150,6 +150,7 @@ int main(int argc, char **argv) cfg->app_name = "L2 Forwarding Application"; cfg->app_options = l2fwd_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) diff --git a/examples/txgen/main.c b/examples/txgen/main.c index 3e752ee..6afd199 100644 --- a/examples/txgen/main.c +++ b/examples/txgen/main.c @@ -18,7 +18,7 @@ #include #include -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf = NULL; uint8_t *packet_template = NULL; @@ -291,6 +291,7 @@ int main(int argc, char **argv) cfg->app_name = "Traffic Generation Application"; cfg->app_options = txgen_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) diff --git a/examples/unit-tests/correctness.c b/examples/unit-tests/correctness.c index 3fe8e38..e5eebe0 100644 --- a/examples/unit-tests/correctness.c +++ b/examples/unit-tests/correctness.c @@ -21,7 +21,7 @@ #define TEST_PORT 8080 -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf = NULL; struct test_stats *stats_arr; @@ -47,7 +47,7 @@ struct nf_info { bool first_packet_received; uint64_t expected_mod_value; uint64_t next_expected_pkt_id; -} nf_info_arr[MAX_NFS] = {0}; +} nf_info_arr[MAX_NFS] = { 0 }; struct test_stats { uint64_t pkt_count; @@ -63,13 +63,8 @@ struct appconf { int hops; } app_conf; -static const char *correctness_options[] = { - "-c \tStart CPU (default: 
0)", - "-e \tEnd CPU (default: 0)", - "-s \tStats CPU (default: 1)", - "-h \tNumber of hops (default: 1)", - NULL -}; +static const char *correctness_options[] = { "-c \tStart CPU (default: 0)", "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", "-h \tNumber of hops (default: 1)", NULL }; static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { @@ -164,28 +159,28 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) payload_len = ntohs(udphdr->len) - sizeof(struct udphdr); size_t testHeaderLen = sizeof(struct testHeader); - void *payload_end = pos + payload_len; + void *payload_end = pos + payload_len; struct testHeader *testHeader = NULL; /* First NF */ if (ntohs(udphdr->dest) != TEST_PORT) { // Append test header at the end of the UDP payload - testHeader = (struct testHeader *)payload_end; - testHeader->lastHop = app_conf.hops; - testHeader->hopCount = 1; - testHeader->old_dst = udphdr->dest; - - *len += testHeaderLen; - udphdr->len = htons(ntohs(udphdr->len) + testHeaderLen); - iph->tot_len = htons(ntohs(iph->tot_len) + testHeaderLen); - - udphdr->dest = htons(TEST_PORT); - - stats->pkt_correct++; + testHeader = (struct testHeader *)payload_end; + testHeader->lastHop = app_conf.hops; + testHeader->hopCount = 1; + testHeader->old_dst = udphdr->dest; + + *len += testHeaderLen; + udphdr->len = htons(ntohs(udphdr->len) + testHeaderLen); + iph->tot_len = htons(ntohs(iph->tot_len) + testHeaderLen); + + udphdr->dest = htons(TEST_PORT); + + stats->pkt_correct++; } else { - // testHeader is at the end of the UDP payload - testHeader = (struct testHeader *)(payload_end - testHeaderLen); + // testHeader is at the end of the UDP payload + testHeader = (struct testHeader *)(payload_end - testHeaderLen); testHeader->hopCount++; @@ -205,7 +200,7 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) } struct nf_info *sender_info = &nf_info_arr[sender_nf_id]; - + if 
(!sender_info->first_packet_received) { sender_info->first_packet_received = true; sender_info->sender_next_size = sender_next_size; @@ -214,26 +209,22 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) stats->pkt_correct++; // first packet is always correct } else { if (sender_next_size != sender_info->sender_next_size) { - log_error("ERROR: nf_next_size mismatch for NF ID %d: expected %d, got %d", - sender_nf_id, - sender_info->sender_next_size, - sender_next_size); + log_error("ERROR: nf_next_size mismatch for NF ID %d: expected %d, got %d", sender_nf_id, + sender_info->sender_next_size, sender_next_size); stats->pkt_corrupted++; goto test_header_update; } if (received_pktId % sender_next_size != sender_info->expected_mod_value) { log_error("ERROR: pktId %% sender_next_size mismatch for NF ID %d: expected %lu, got %lu", - sender_nf_id, - sender_info->expected_mod_value, - received_pktId % sender_next_size); + sender_nf_id, sender_info->expected_mod_value, received_pktId % sender_next_size); stats->pkt_corrupted++; goto test_header_update; } uint64_t next_expected_pkt_id = sender_info->next_expected_pkt_id; - if(received_pktId != next_expected_pkt_id) { + if (received_pktId != next_expected_pkt_id) { if (received_pktId < next_expected_pkt_id) { stats->pkt_corrupted++; } else { @@ -245,10 +236,8 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) stats->pkt_correct++; } } - } - test_header_update: testHeader->pktId = stats->pkt_count++; testHeader->sender_nf_id = cfg->nf_id; @@ -265,11 +254,11 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) udphdr->dest = tmp_port; udphdr->len = htons(ntohs(udphdr->len) - testHeaderLen); *len -= testHeaderLen; - + tmp_port = udphdr->dest; udphdr->dest = udphdr->source; udphdr->source = tmp_port; - + iph->tot_len = htons(ntohs(iph->tot_len) - testHeaderLen); memcpy(tmp_mac, eth->h_dest, ETH_ALEN); @@ -326,7 +315,7 @@ static void 
*socket_routine(void *arg) xskvecs[i].options = ((count % next_size) << 16) | (xskvecs[i].options & 0xFFFF); count++; } - + char *pkt = xskvecs[i].data; if (!nb_frags++) @@ -395,15 +384,16 @@ int main(int argc, char **argv) cfg->app_name = "Correctness Test Application"; cfg->app_options = correctness_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) goto out_cfg; - + if (parse_app_args(argc, argv, &app_conf, shift) < 0) goto out_cfg; - + if (flash__configure_nf(&nf, cfg) < 0) goto out_cfg; diff --git a/examples/unit-tests/fwddrop.c b/examples/unit-tests/fwddrop.c index 5713e84..b426f5e 100644 --- a/examples/unit-tests/fwddrop.c +++ b/examples/unit-tests/fwddrop.c @@ -12,7 +12,7 @@ #include #include -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf = NULL; @@ -224,6 +224,7 @@ int main(int argc, char **argv) cfg->app_name = "Unit Test: Forward and Drop Application"; cfg->app_options = fwddrop_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) diff --git a/examples/unit-tests/fwdrr.c b/examples/unit-tests/fwdrr.c index f31c903..e30164e 100644 --- a/examples/unit-tests/fwdrr.c +++ b/examples/unit-tests/fwdrr.c @@ -12,7 +12,7 @@ #include #include -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf = NULL; @@ -197,6 +197,7 @@ int main(int argc, char **argv) cfg->app_name = "Round-Robin Forwarding Application"; cfg->app_options = fwdrr_options; + cfg->done = &done; shift = flash__parse_cmdline_args(argc, argv, cfg); if (shift < 0) diff --git a/lib/flash/nf/flash_nf.c b/lib/flash/nf/flash_nf.c index bfa6e96..e8d4027 100644 --- a/lib/flash/nf/flash_nf.c +++ b/lib/flash/nf/flash_nf.c @@ -14,8 +14,6 @@ #include "flash_nf.h" -bool done; - static int set_nonblocking(int sockfd) { int flags = fcntl(sockfd, F_GETFL, 0); @@ -40,7 +38,7 @@ void flash__wait(struct config *cfg) if (set_nonblocking(cfg->uds_sockfd) 
< 0) log_warn("Failed to set UDS socket to non-blocking mode"); - while (!done) { + while (!*cfg->done) { int bytes_received = read(cfg->uds_sockfd, &cmd, sizeof(int)); if (bytes_received < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK) { @@ -51,11 +49,11 @@ void flash__wait(struct config *cfg) } } else if (bytes_received == 0) { log_info("Server closed the connection"); - done = true; + *cfg->done = true; break; } else { log_info("Received signal from server"); - done = true; + *cfg->done = true; } } } diff --git a/lib/flash/nf/flash_nf.h b/lib/flash/nf/flash_nf.h index 35d6fc3..bf60a48 100644 --- a/lib/flash/nf/flash_nf.h +++ b/lib/flash/nf/flash_nf.h @@ -28,7 +28,6 @@ struct stats_conf { struct config *cfg; }; -extern bool done; struct ether_addr; /* Control Path APIs */ diff --git a/lib/flash/nf/flash_stats.c b/lib/flash/nf/flash_stats.c index c7acfca..5f851d3 100644 --- a/lib/flash/nf/flash_stats.c +++ b/lib/flash/nf/flash_stats.c @@ -240,7 +240,7 @@ void *flash__stats_thread(void *conf) for (int i = 0; i < cfg->total_sockets; i++) nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - while (!done) { + while (!*cfg->done) { sleep(interval); if (system("clear") != 0) log_error("Terminal clear error"); diff --git a/lib/include/flash_defines.h b/lib/include/flash_defines.h index 356c016..abf86b0 100644 --- a/lib/include/flash_defines.h +++ b/lib/include/flash_defines.h @@ -70,6 +70,7 @@ struct config { int umem_offset; bool frags_enabled; bool rx_first; + volatile bool *done; #ifdef STATS clockid_t clock; int verbose; @@ -175,7 +176,6 @@ struct socket { void *flash_pool; uint32_t outstanding_tx; uint64_t idle_timestamp; - #ifdef STATS struct xsk_ring_stats ring_stats; struct xsk_app_stats app_stats; From 9bf71d12513f01ca0cb2a389f20aad3dc4b9460c Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Wed, 10 Sep 2025 16:19:43 +0530 Subject: [PATCH 19/43] fix: meson arguments for libraries - log_use_color as well as debug logs were not getting printed due to 
wrong setup of meson arguments. - meson uses a wrapper arround cargo to build the rust flash libraries and examples (Issue #1) --- examples/meson.build | 17 ----------------- lib/flash/log/meson.build | 16 +--------------- meson.build | 22 ++++++++++++++-------- meson_options.txt | 2 +- 4 files changed, 16 insertions(+), 41 deletions(-) diff --git a/examples/meson.build b/examples/meson.build index d9cd80e..263b790 100644 --- a/examples/meson.build +++ b/examples/meson.build @@ -16,19 +16,6 @@ dirs = [ 'txgen' ] -# Rust examples (only if Rust is enabled) -rust_dirs = [] -if get_option('enable_rust') - rust_dirs = [ - 'helloworld-rs', - 'simplefwd-rs', - 'l2fwd-rs', - 'ip4ping-rs', - 'maglev-rs', - 'firewall-rs' - ] -endif - def_deps = [include, log, nf, params, uds] foreach d : dirs @@ -68,9 +55,5 @@ if get_option('enable_rust') and cargo.found() build_always_stale: true, ) - foreach rust_dir : rust_dirs - message('Rust example: ' + rust_dir + ' will be built with the workspace') - endforeach - message('<<< Rust examples configured') endif \ No newline at end of file diff --git a/lib/flash/log/meson.build b/lib/flash/log/meson.build index 11e5b01..0cc4d73 100644 --- a/lib/flash/log/meson.build +++ b/lib/flash/log/meson.build @@ -4,21 +4,7 @@ sources = files('log.c') headers = files('log.h') -log_c_args = [] -if get_option('log_use_color') - log_c_args += ['-DLOG_USE_COLOR'] -endif - -if get_option('buildtype') == 'debug' or get_option('buildtype') == 'debugoptimized' - log_c_args += ['-DLOG_ENABLE_DEBUG'] - message('Log library: Debug/trace logging enabled for build type: ' + get_option('buildtype')) -else - message('Log library: Debug/trace logging disabled for build type: ' + get_option('buildtype')) -endif - -liblog = library(libname, sources, - install: true, - c_args: log_c_args) +liblog = library(libname, sources, install: true) log = declare_dependency(link_with: liblog, include_directories: include_directories('.')) flash_libs += log \ No newline at end of 
file diff --git a/meson.build b/meson.build index 5e35e65..e5d8c01 100644 --- a/meson.build +++ b/meson.build @@ -4,7 +4,7 @@ project( 'flash', 'C', - version: '25.5-beta', + version: '25.9-beta', license: 'Apache-2.0', default_options: [ 'buildtype=release', @@ -19,6 +19,7 @@ use_static_libs = get_option('default_library') == 'static' cc = meson.get_compiler('c') +# Enable Rust support if the option is set and Cargo is available enable_rust = get_option('enable_rust') cargo = find_program('cargo', required: enable_rust) rust_build = [] @@ -55,7 +56,6 @@ if cc.has_header('ncurses.h', required: true) add_project_link_arguments('-lncurses', language: 'c') endif -# if link_lib is empty, do not add it to project properties add_project_link_arguments('-ldl', language: 'c') xdp_dep = dependency( @@ -134,6 +134,18 @@ foreach arg : warning_flags endif endforeach +# set loggging options for log library +if get_option('log_use_color') + add_project_arguments('-DLOG_USE_COLOR', language: 'c') +endif + +if get_option('buildtype') == 'debug' or get_option('buildtype') == 'debugoptimized' + add_project_arguments('-DLOG_ENABLE_DEBUG', language: 'c') + message('Log library: debug/trace logging enabled for build type - ' + get_option('buildtype')) +else + message('Log library: debug/trace logging disabled for build type - ' + get_option('buildtype')) +endif + # specify -D_GNU_SOURCE unconditionally add_project_arguments('-D_GNU_SOURCE', language: 'c') @@ -185,9 +197,3 @@ pkg.generate( install_dir: 'lib/pkgconfig', ) message('<<< Done pkg-config file') - -# Rust builds are now handled in examples/meson.build -if enable_rust and cargo.found() - message('>>> Rust builds will be handled by examples/meson.build') - message('<<< Rust builds configured') -endif \ No newline at end of file diff --git a/meson_options.txt b/meson_options.txt index 6fe49f6..032ea14 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -4,5 +4,5 @@ option('log_use_color', type: 'boolean', value: true, 
description: 'Enable colored output in the log library') -option('enable_rust', type: 'boolean', value: false, +option('enable_rust', type: 'boolean', value: true, description: 'Enable building Rust applications and libraries') From e88de2e58316fca67c1518733957478e5c8d69ae Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Wed, 10 Sep 2025 16:36:35 +0530 Subject: [PATCH 20/43] feat: added util library for L7 applications --- lib/flash/meson.build | 2 +- lib/flash/util/cpu.c | 87 ++++++++ lib/flash/util/http_parsing.c | 294 +++++++++++++++++++++++++ lib/flash/util/include/cpu.h | 38 ++++ lib/flash/util/include/http_parsing.h | 77 +++++++ lib/flash/util/include/netlib.h | 81 +++++++ lib/flash/util/include/tdate_parse.h | 54 +++++ lib/flash/util/meson.build | 23 ++ lib/flash/util/netlib.c | 275 +++++++++++++++++++++++ lib/flash/util/tdate_parse.c | 299 ++++++++++++++++++++++++++ meson.build | 5 + 11 files changed, 1234 insertions(+), 1 deletion(-) create mode 100644 lib/flash/util/cpu.c create mode 100644 lib/flash/util/http_parsing.c create mode 100644 lib/flash/util/include/cpu.h create mode 100644 lib/flash/util/include/http_parsing.h create mode 100644 lib/flash/util/include/netlib.h create mode 100644 lib/flash/util/include/tdate_parse.h create mode 100644 lib/flash/util/meson.build create mode 100644 lib/flash/util/netlib.c create mode 100644 lib/flash/util/tdate_parse.c diff --git a/lib/flash/meson.build b/lib/flash/meson.build index df73063..b884837 100644 --- a/lib/flash/meson.build +++ b/lib/flash/meson.build @@ -5,7 +5,7 @@ special_dirs = [ 'log', ] -dirs = ['uds', 'common', 'monitor', 'params', 'pool', 'nf'] +dirs = ['uds', 'common', 'monitor', 'params', 'pool', 'nf', 'util'] foreach special_dir : special_dirs sources = [] diff --git a/lib/flash/util/cpu.c b/lib/flash/util/cpu.c new file mode 100644 index 0000000..cb634f3 --- /dev/null +++ b/lib/flash/util/cpu.c @@ -0,0 +1,87 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +/*----------------------------------------------------------------------------*/ +int GetNumCPUs(void) +{ + return sysconf(_SC_NPROCESSORS_ONLN); +} +/*----------------------------------------------------------------------------*/ +int CoreAffinitize(int cpu) +{ + cpu_set_t *cmask; + struct bitmask *bmask; + size_t n; + int ret; + + n = GetNumCPUs(); + + if (cpu < 0 || cpu >= (int)n) { + errno = -EINVAL; + return -1; + } + + cmask = CPU_ALLOC(n); + if (cmask == NULL) + return -1; + + CPU_ZERO_S(n, cmask); + CPU_SET_S(cpu, n, cmask); + + ret = sched_setaffinity(0, n, cmask); + + CPU_FREE(cmask); + + if (numa_max_node() == 0) + return ret; + + bmask = numa_bitmask_alloc(16); + assert(bmask); + + numa_bitmask_setbit(bmask, cpu % 2); + numa_set_membind(bmask); + numa_bitmask_free(bmask); + + return ret; +} diff --git a/lib/flash/util/http_parsing.c b/lib/flash/util/http_parsing.c new file mode 100644 index 0000000..b35d7be --- /dev/null +++ b/lib/flash/util/http_parsing.c @@ -0,0 +1,294 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include "http_parsing.h" +#include "tdate_parse.h" + +#define SPACE_OR_TAB(x) ((x) == ' ' || (x) == '\t') +#define CR_OR_NEWLINE(x) ((x) == '\r' || (x) == '\n') + +/*---------------------------------------------------------------------------*/ +static char *nre_strcasestr(const char *buf, const char *key) +{ + int n = strlen(key) - 1; + const char *p = buf; + + while (*p) { + while (*p && *p != *key) /* first character match */ + p++; + + if (*p == '\0') + return (NULL); + + if (!strncasecmp(p + 1, key + 1, n)) + return (char *)(uintptr_t)p; + p++; + } + return NULL; +} +/*--------------------------------------------------------------------------*/ +int find_http_header(char *data, int len) +{ + char *temp = data; + int hdr_len = 0; + char ch = data[len]; /* remember it */ + + /* null terminate the string first */ + data[len] = 0; + while (!hdr_len && (temp = strchr(temp, '\n')) != NULL) { + temp++; + if (*temp == 
'\n') + hdr_len = temp - data + 1; + else if (len > 0 && *temp == '\r' && *(temp + 1) == '\n') + hdr_len = temp - data + 2; + } + data[len] = ch; /* put it back */ + + /* terminate the header if found */ + if (hdr_len) + data[hdr_len - 1] = 0; + + return hdr_len; +} +/*--------------------------------------------------------------------------*/ +int is_http_request(char *data, int len) +{ + if (len >= (int)sizeof(HTTP_GET) - 1 && !strncmp(data, HTTP_GET, sizeof(HTTP_GET) - 1)) + return GET; + + if (len >= (int)sizeof(HTTP_POST) - 1 && !strncmp(data, HTTP_POST, sizeof(HTTP_POST) - 1)) + return POST; + + return 0; +} +/*--------------------------------------------------------------------------*/ +int is_http_response(char *data, int len) +{ + if (len < (int)(sizeof(HTTP_STR) - 1)) + return 0; + + if (!strncmp(data, HTTP_STR, sizeof(HTTP_STR) - 1)) + return 1; + + return 0; +} +/*---------------------------------------------------------------------------*/ +char *http_header_str_val(const char *buf, const char *key, const int keylen, char *value, int value_len) +{ + char *temp = nre_strcasestr(buf, key); + int i = 0; + + if (temp == NULL) { + *value = 0; + return NULL; + } + + /* skip whitespace or tab */ + temp += keylen; + while (*temp && SPACE_OR_TAB(*temp)) + temp++; + + /* if we reached the end of the line, forget it */ + if (*temp == '\0' || CR_OR_NEWLINE(*temp)) { + *value = 0; + return NULL; + } + + /* copy value data */ + while (*temp && !CR_OR_NEWLINE(*temp) && i < value_len - 1) + value[i++] = *temp++; + value[i] = 0; + + if (i == 0) { + *value = 0; + return NULL; + } + + return value; +} +/*---------------------------------------------------------------------------*/ +long int http_header_long_val(const char *response, const char *key, int key_len) +{ +#define C_TYPE_LEN 50 + long int len; + char value[C_TYPE_LEN]; + char *temp = http_header_str_val(response, key, key_len, value, C_TYPE_LEN); + + if (temp == NULL) + return -1; + + len = strtol(temp, NULL, 
10); + if (errno == EINVAL || errno == ERANGE) + return -1; + + return len; +} +/*--------------------------------------------------------------------------*/ +int http_parse_first_resp_line(const char *data, int len, int *scode, int *ver) +{ + (void)len; + const char *p = data; + + /* A typical first line: HTTP/1.1 200 OK */ + if (strncmp(p, HTTP_STR, sizeof(HTTP_STR) - 1) != 0) + return (0); + + /* version */ + p += sizeof(HTTP_STR); + if (strncmp(p, "1.1", 3) == 0) + *ver = HTTP_11; + else if (strncmp(p, "1.0", 3) == 0) + *ver = HTTP_10; + else + *ver = HTTP_09; + + /* status code */ + p += sizeof("1.1"); + *scode = strtol(p, NULL, 10); + if (errno == EINVAL || errno == ERANGE) + return 0; + return 1; +} +/*--------------------------------------------------------------------------*/ +time_t http_header_date(const char *data, const char *field, int len) +{ + char buf[256]; + + if (!http_header_str_val(data, field, len, buf, sizeof(buf))) + return (time_t)-1; + return httpdate_to_timet(buf); +} +/*--------------------------------------------------------------------------*/ +int http_check_header_field(const char *data, const char *field) +{ + if (nre_strcasestr(data, field)) + return 1; + return 0; +} +/*--------------------------------------------------------------------------*/ +char *http_get_http_version_resp(char *data, int len, char *value, int value_len) +{ + char *temp = data; + int i = 0; + + if (len < (int)(sizeof(HTTP_STR) - 1)) { + *value = 0; + return NULL; + } + + if (strncmp(data, HTTP_STR, sizeof(HTTP_STR) - 1)) { + *value = 0; + return NULL; + } + + while (*temp && !SPACE_OR_TAB(*temp) && i < value_len - 1) + value[i++] = *temp++; + value[i] = 0; + + return value; +} +/*--------------------------------------------------------------------------*/ +char *http_get_url(char *data, int data_len, char *value, int value_len) +{ + (void)data_len; + char *ret = data; + char *temp; + int i = 0; + + if (strncmp(data, HTTP_GET, sizeof(HTTP_GET) - 1)) { + 
*value = 0; + return NULL; + } + + ret += sizeof(HTTP_GET); + while (*ret && SPACE_OR_TAB(*ret)) + ret++; + + temp = ret; + while (*temp && *temp != ' ' && i < value_len - 1) { + value[i++] = *temp++; + } + value[i] = 0; + + return ret; +} +/*---------------------------------------------------------------------------*/ +int http_get_status_code(void *response) +{ + int code = 0; + char *temp = response; + + while (*temp && !SPACE_OR_TAB(*temp++)) + ; + + code = strtol(temp, NULL, 10); + if (errno == EINVAL || errno == ERANGE) + return -1; + + return code; +} +/*---------------------------------------------------------------------------*/ +int http_get_maxage(char *cache_ctl, int len) +{ +#define MAXAGE "max-age=" +#define SMAXAGE "s-maxage=" + + if (!*cache_ctl) + return -1; + + char *temp = NULL; + + temp = nre_strcasestr(cache_ctl, MAXAGE); + if (temp) { + len = strtol(temp + sizeof(MAXAGE), NULL, 10); + if (errno == EINVAL || errno == ERANGE) + return -1; + return len; + } + + temp = nre_strcasestr(cache_ctl, SMAXAGE); + if (temp) { + len = strtol(temp + sizeof(SMAXAGE), NULL, 10); + if (errno == EINVAL || errno == ERANGE) + return -1; + return len; + } + return -1; +} diff --git a/lib/flash/util/include/cpu.h b/lib/flash/util/include/cpu.h new file mode 100644 index 0000000..0d68625 --- /dev/null +++ b/lib/flash/util/include/cpu.h @@ -0,0 +1,38 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CPU_H_ +#define __CPU_H_ + +int GetNumCPUs(void); +int CoreAffinitize(int cpu); + +#endif /* __CPU_H_ */ diff --git a/lib/flash/util/include/http_parsing.h b/lib/flash/util/include/http_parsing.h new file mode 100644 index 0000000..8c567e9 --- /dev/null +++ b/lib/flash/util/include/http_parsing.h @@ -0,0 +1,77 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __NRE_HTTP_PARSING +#define __NRE_HTTP_PARSING + +#define HTTP_STR "HTTP" +#define HTTPV0_STR "HTTP/1.0" +#define HTTPV1_STR "HTTP/1.1" +#define HTTP_GET "GET" +#define HTTP_POST "POST" +#define HTTP_CLOSE "Close" +#define HTTP_KEEP_ALIVE "Keep-Alive" +#define HOST_HDR "\nHost:" +#define CONTENT_LENGTH_HDR "\nContent-Length:" +#define CONTENT_TYPE_HDR "\nContent-Type:" +#define CACHE_CONTROL_HDR "\nCache-Control:" +#define CONNECTION_HDR "\nConnection:" +#define DATE_HDR "\nDate:" +#define EXPIRES_HDR "\nExpires:" +#define AGE_HDR "\nAge:" +#define LAST_MODIFIED_HDR "\nLast-Modified:" +#define IF_MODIFIED_SINCE_HDR "\nIf-Modified_Since:" +#define PRAGMA_HDR "\nPragma:" +#define RANGE_HDR "\nRange:" +#define IF_RANGE_HDR "\nIf-Range:" +#define ETAG_HDR "\nETag:" + +enum { GET = 1, POST }; + +int find_http_header(char *data, int len); +int is_http_response(char *data, int len); +int is_http_request(char *data, int len); + +char *http_header_str_val(const char *buf, const char *key, const int key_len, char *value, int value_len); +long int http_header_long_val(const char *buf, const char *key, int key_len); + +char *http_get_http_version_resp(char *data, int len, char *value, int value_len); +char *http_get_url(char *data, int data_len, char *value, int value_len); +int http_get_status_code(void *response); +int http_get_maxage(char *cache_ctl, int len); + +time_t http_header_date(const char *data, const char *field, int len); + +enum { HTTP_09, HTTP_10, HTTP_11 }; /* http version */ +int http_parse_first_resp_line(const char *data, int len, int *scode, int *ver); +int http_check_header_field(const char *data, const char *field); + +#endif diff --git a/lib/flash/util/include/netlib.h b/lib/flash/util/include/netlib.h new file mode 100644 index 0000000..fdc4fbe --- /dev/null +++ b/lib/flash/util/include/netlib.h @@ -0,0 +1,81 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _NET_LIB_H_ +#define _NET_LIB_H_ + +#include + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef MAX +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#endif + +#ifndef MIN +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) +#endif + +#ifndef VERIFY +#define VERIFY(x) if (!(x)) {fprintf(stderr, "error: FILE:%s LINE:%d FUNC: %s", __FILE__, __LINE__, __FUNCTION__); assert(0);} +#endif + +#ifndef FREE +#define FREE(x) if (x) {free(x); (x) = NULL;} +#endif + +int GetNumCPUCores(void); +int AffinitizeThreadToCore(int core); +int CreateServerSocket(int port, int isNonBlocking); +int CreateConnectionSocket(in_addr_t addr, int port, int isNonBlocking); +int mystrtol(const char *nptr, int base); + +/* processing options */ +struct Options { + char *op_name; + char **op_varptr; + char *op_comment; +} Options; +void ParseOptions(int argc, const char** argv, struct Options* ops); +void PrintOptions(const struct Options* ops, int printVal); + + +/* HTTP header processing */ +char *GetHeaderString(const char *buf, const char* header, int hdrsize); +int GetHeaderLong(const char* buf, const char* header, int hdrsize, long int *val); + +#endif diff --git a/lib/flash/util/include/tdate_parse.h b/lib/flash/util/include/tdate_parse.h new file mode 100644 index 0000000..401a930 --- /dev/null +++ b/lib/flash/util/include/tdate_parse.h @@ -0,0 +1,54 @@ +/* tdate_parse.h - parse string dates into internal form, stripped-down version +** +** Copyright (C) 1995 by Jef Poskanzer . All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** 1. Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. 
+** +** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +** SUCH DAMAGE. +*/ + +#ifndef _TDATE_PARSE_H_ +#define _TDATE_PARSE_H_ + +/* convert a http date string to time_t format */ + +#ifdef __cplusplus +extern "C" { +#endif +extern time_t httpdate_to_timet(const char *str); + +/* + Convert 't' (in time_t format) into a HTTP date string + + + t: input (epoch-based time) + str: output string that holds the HTTP date strinng + strlen: the buffer size of str + + 0 : in case of successful conversion + -1 : otherwise + by KyoungSoo Park +*/ +extern int timet_to_httpdate(time_t t, char *str, int strlen); + +#ifdef __cplusplus +} +#endif +#endif /* _TDATE_PARSE_H_ */ diff --git a/lib/flash/util/meson.build b/lib/flash/util/meson.build new file mode 100644 index 0000000..3354e2f --- /dev/null +++ b/lib/flash/util/meson.build @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2025 Debojeet Das + +sources = files( + 'tdate_parse.c', + 'http_parsing.c', + 'netlib.c', + 'cpu.c' +) + +headers = files( + 'include/tdate_parse.h', + 'include/http_parsing.h', + 'include/netlib.h', + 'include/cpu.h' +) + +deps += [] + +libutil = library(libname, sources, install: true, dependencies: deps, include_directories: include_directories('./include')) +util = 
declare_dependency(link_with: libutil, include_directories: include_directories('./include')) + +flash_libs += util \ No newline at end of file diff --git a/lib/flash/util/netlib.c b/lib/flash/util/netlib.c new file mode 100644 index 0000000..c465a1b --- /dev/null +++ b/lib/flash/util/netlib.c @@ -0,0 +1,275 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlib.h" + +/*----------------------------------------------------------------------------*/ +int GetNumCPUCores(void) +{ + return (int)sysconf(_SC_NPROCESSORS_ONLN); +} +/*----------------------------------------------------------------------------*/ +int AffinitizeThreadToCore(int core) +{ + cpu_set_t *cmask; + int n, ret; + + n = sysconf(_SC_NPROCESSORS_ONLN); + + if (core < 0 || core >= n) { + fprintf(stderr, "%d: invalid CPU number.\n", core); + return -1; + } + + cmask = CPU_ALLOC(n); + if (cmask == NULL) { + fprintf(stderr, "%d: uexpected cmask.\n", n); + return -1; + } + + CPU_ZERO_S(n, cmask); + CPU_SET_S(core, n, cmask); + + ret = sched_setaffinity(0, n, cmask); + + CPU_FREE(cmask); + return ret; +} +/*----------------------------------------------------------------------------*/ +int CreateServerSocket(int port, int isNonBlocking) +{ + int s; + struct sockaddr_in addr; + struct linger doLinger; + int doReuse = 1; + + if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + fprintf(stderr, "socket() failed, errno=%d msg=%s\n", errno, strerror(errno)); + return (-1); + } + + /* don't linger on close */ + doLinger.l_onoff = doLinger.l_linger = 0; + if (setsockopt(s, SOL_SOCKET, SO_LINGER, &doLinger, sizeof(doLinger)) == -1) { + close(s); + 
return (-1); + } + + /* reuse addresses */ + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &doReuse, sizeof(doReuse)) == -1) { + close(s); + return (-1); + } + + /* make the listening socket nonblocking */ + if (isNonBlocking) { + if (fcntl(s, F_SETFL, O_NDELAY) < 0) { + fprintf(stderr, "fcntl() failed, errno=%d msg=%s\n", errno, strerror(errno)); + close(s); + return (-1); + } + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = htons(port); + if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + fprintf(stderr, "bind() failed, errno=%d msg=%s\n", errno, strerror(errno)); + close(s); + return (-1); + } + + if (listen(s, 1024) < 0) { + close(s); + return (-1); + } + + return (s); +} +/*-------------------------------------------------------------------------*/ +int CreateConnectionSocket(in_addr_t netAddr, int portNum, int nonBlocking) +{ + struct sockaddr_in saddr; + int fd; + struct linger doLinger; + int doReuse = 1; + + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + fprintf(stderr, "failed creating socket - %d\n", errno); + return (-1); + } + + /* don't linger on close */ + doLinger.l_onoff = doLinger.l_linger = 0; + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &doLinger, sizeof(doLinger)) == -1) { + close(fd); + return (-1); + } + + /* reuse addresses */ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &doReuse, sizeof(doReuse)) == -1) { + close(fd); + return (-1); + } + + if (nonBlocking) { + if (fcntl(fd, F_SETFL, O_NDELAY) < 0) { + fprintf(stderr, "failed fcntl'ing socket - %d\n", errno); + close(fd); + return (-1); + } + } + + saddr.sin_family = AF_INET; + saddr.sin_addr.s_addr = netAddr; + saddr.sin_port = htons(portNum); + + if (connect(fd, (struct sockaddr *)&saddr, sizeof(struct sockaddr_in)) < 0) { + if (errno == EINPROGRESS) + return (fd); + fprintf(stderr, "failed connecting socket addr=%s port %d - errno %d\n", inet_ntoa(saddr.sin_addr), portNum, errno); + 
close(fd); + return (-1); + } + + return (fd); +} +/*----------------------------------------------------------------------------*/ +void ParseOptions(int argc, const char **argv, struct Options *ops) +{ + int i, j; + + for (i = 1; i < argc; i++) { + for (j = 0; ops[j].op_name; j++) { + if (strcmp(ops[j].op_name, argv[i]) == 0) { + if (i + 1 >= argc) { + fprintf(stderr, "no value provided for %s option\n", argv[i]); + exit(-1); + } + *(ops[j].op_varptr) = (char *)(uintptr_t)argv[++i]; + break; + } + } + if (ops[j].op_name == NULL) { + fprintf(stderr, "option %s is not supported\n", argv[i]); + exit(-1); + } + } +} +/*----------------------------------------------------------------------------*/ +void PrintOptions(const struct Options *ops, int printVal) +{ + int i; + + if (printVal) { + /* for printing option values */ + printf("The value for each option is as follows:\n"); + } else { + /* for explaining the options */ + printf("Here is the list of allowable options:\n"); + } + for (i = 0; ops[i].op_name; i++) { + printf("%s: %s\n", ops[i].op_name, printVal ? 
*ops[i].op_varptr : ops[i].op_comment); + } +} +/*----------------------------------------------------------------------------*/ +char *GetHeaderString(const char *buf, const char *header, int hdrsize) +{ +#define SKIP_SPACE(x) \ + while ((*(x)) && isspace((*(x)))) \ + (x)++; + char *temp = strstr(buf, header); + + if (temp) { + temp += hdrsize; + SKIP_SPACE(temp); + if (*temp) + return (temp); + } + return (NULL); +} +/*----------------------------------------------------------------------------*/ +int GetHeaderLong(const char *buf, const char *header, int hdrsize, long int *val) +{ + long int temp_val; + char *temp; + + if ((temp = GetHeaderString(buf, header, hdrsize)) != NULL) { + temp_val = strtol(temp, NULL, 10); + if (errno != ERANGE && errno != EINVAL) { + *val = temp_val; + return (TRUE); + } + } + return (FALSE); +} +/*----------------------------------------------------------------------------*/ +int mystrtol(const char *nptr, int base) +{ + (void)base; + int rval; + char *endptr; + + errno = 0; + rval = strtol(nptr, &endptr, 10); + /* check for strtol errors */ + if ((errno == ERANGE && (rval == INT_MAX || rval == INT_MIN)) || (errno != 0 && rval == 0)) { + perror("strtol"); + exit(EXIT_FAILURE); + } + if (endptr == nptr) { + fprintf(stderr, "Parsing strtol error!\n"); + exit(EXIT_FAILURE); + } + + return rval; +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/util/tdate_parse.c b/lib/flash/util/tdate_parse.c new file mode 100644 index 0000000..24de0b0 --- /dev/null +++ b/lib/flash/util/tdate_parse.c @@ -0,0 +1,299 @@ +/* tdate_parse - parse string dates into internal form, stripped-down version +** +** Copyright (C) 1995 by Jef Poskanzer . All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** 1. 
Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. +** +** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +** SUCH DAMAGE. 
+*/ + +/* This is a stripped-down version of date_parse.c, available at +** http://www.acme.com/software/date_parse/ +*/ + +#include + +#include +#ifdef HAVE_MEMORY_H +#include +#endif +#include +#include +#include +#include + +#include "tdate_parse.h" + +struct strlong { + const char *s; + long l; +}; + +static void pound_case(char *str) +{ + for (; *str != '\0'; ++str) { + if (isupper(*str)) + *str = tolower(*str); + } +} + +static int strlong_compare(const void *v1, const void *v2) +{ + return strcmp(((const struct strlong *)v1)->s, ((const struct strlong *)v2)->s); +} + +static int strlong_search(char *str, struct strlong *tab, int n, long *lP) +{ + int i, h, l, r; + + l = 0; + h = n - 1; + for (;;) { + i = (h + l) / 2; + r = strcmp(str, tab[i].s); + if (r < 0) + h = i - 1; + else if (r > 0) + l = i + 1; + else { + *lP = tab[i].l; + return 1; + } + if (h < l) + return 0; + } +} + +static int scan_wday(char *str_wday, long *tm_wdayP) +{ + static struct strlong wday_tab[] = { + { "sun", 0 }, { "sunday", 0 }, { "mon", 1 }, { "monday", 1 }, { "tue", 2 }, + { "tuesday", 2 }, { "wed", 3 }, { "wednesday", 3 }, { "thu", 4 }, { "thursday", 4 }, + { "fri", 5 }, { "friday", 5 }, { "sat", 6 }, { "saturday", 6 }, + }; + static int sorted = 0; + + if (!sorted) { + (void)qsort(wday_tab, sizeof(wday_tab) / sizeof(struct strlong), sizeof(struct strlong), strlong_compare); + sorted = 1; + } + pound_case(str_wday); + return strlong_search(str_wday, wday_tab, sizeof(wday_tab) / sizeof(struct strlong), tm_wdayP); +} + +static int scan_mon(char *str_mon, long *tm_monP) +{ + static struct strlong mon_tab[] = { + { "jan", 0 }, { "january", 0 }, { "feb", 1 }, { "february", 1 }, { "mar", 2 }, { "march", 2 }, + { "apr", 3 }, { "april", 3 }, { "may", 4 }, { "jun", 5 }, { "june", 5 }, { "jul", 6 }, + { "july", 6 }, { "aug", 7 }, { "august", 7 }, { "sep", 8 }, { "september", 8 }, { "oct", 9 }, + { "october", 9 }, { "nov", 10 }, { "november", 10 }, { "dec", 11 }, { "december", 11 }, + }; + 
static int sorted = 0; + + if (!sorted) { + (void)qsort(mon_tab, sizeof(mon_tab) / sizeof(struct strlong), sizeof(struct strlong), strlong_compare); + sorted = 1; + } + pound_case(str_mon); + return strlong_search(str_mon, mon_tab, sizeof(mon_tab) / sizeof(struct strlong), tm_monP); +} + +static int is_leap(int year) +{ + return year % 400 ? (year % 100 ? (year % 4 ? 0 : 1) : 0) : 1; +} + +/* Basically the same as mktime(). */ +static time_t tm_to_time(struct tm *tmP) +{ + time_t t; + static int monthtab[12] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 }; + + /* Years since epoch, converted to days. */ + t = (tmP->tm_year - 70) * 365; + /* Leap days for previous years. */ + t += (tmP->tm_year - 1 - 68) / 4; /* -1: don't count this year */ + /* 100-divisible year is not a leap year + 400-divisible year is a leap year */ + if (tmP->tm_year > 200) + t -= (tmP->tm_year - 1 - 100) / 100; + if (tmP->tm_year > 500) + t += (tmP->tm_year - 1 - 100) / 400; + + /* Days for the beginning of this month. */ + t += monthtab[tmP->tm_mon]; + /* Leap day for this year. */ + if (tmP->tm_mon >= 2 && is_leap(tmP->tm_year)) + ++t; + /* Days since the beginning of this month. */ + t += tmP->tm_mday - 1; /* 1-based field */ + /* Hours, minutes, and seconds. */ + t = t * 24 + tmP->tm_hour; + t = t * 60 + tmP->tm_min; + t = t * 60 + tmP->tm_sec; + + return t; +} + +time_t httpdate_to_timet(const char *str) +{ + struct tm tm; + const char *cp; + char str_mon[500], str_wday[500]; + int tm_sec, tm_min, tm_hour, tm_mday, tm_year; + long tm_mon, tm_wday; + time_t t; + + /* Initialize. */ + memset((char *)&tm, 0, sizeof(struct tm)); + + /* Skip initial whitespace ourselves - sscanf is clumsy at this. */ + for (cp = str; *cp == ' ' || *cp == '\t'; ++cp) + ; + + /* And do the sscanfs. WARNING: you can add more formats here, + ** but be careful! You can easily screw up the parsing of existing + ** formats when you add new ones. The order is important. 
+ */ + + /* DD-mth-YY HH:MM:SS GMT */ + if (sscanf(cp, "%d-%[a-zA-Z]-%d %d:%d:%d GMT", &tm_mday, str_mon, &tm_year, &tm_hour, &tm_min, &tm_sec) == 6 && + scan_mon(str_mon, &tm_mon)) { + tm.tm_mday = tm_mday; + tm.tm_mon = tm_mon; + tm.tm_year = tm_year; + tm.tm_hour = tm_hour; + tm.tm_min = tm_min; + tm.tm_sec = tm_sec; + } + + /* DD mth YY HH:MM:SS GMT */ + else if (sscanf(cp, "%d %[a-zA-Z] %d %d:%d:%d GMT", &tm_mday, str_mon, &tm_year, &tm_hour, &tm_min, &tm_sec) == 6 && + scan_mon(str_mon, &tm_mon)) { + tm.tm_mday = tm_mday; + tm.tm_mon = tm_mon; + tm.tm_year = tm_year; + tm.tm_hour = tm_hour; + tm.tm_min = tm_min; + tm.tm_sec = tm_sec; + } + + /* HH:MM:SS GMT DD-mth-YY */ + else if (sscanf(cp, "%d:%d:%d GMT %d-%[a-zA-Z]-%d", &tm_hour, &tm_min, &tm_sec, &tm_mday, str_mon, &tm_year) == 6 && + scan_mon(str_mon, &tm_mon)) { + tm.tm_hour = tm_hour; + tm.tm_min = tm_min; + tm.tm_sec = tm_sec; + tm.tm_mday = tm_mday; + tm.tm_mon = tm_mon; + tm.tm_year = tm_year; + } + + /* HH:MM:SS GMT DD mth YY */ + else if (sscanf(cp, "%d:%d:%d GMT %d %[a-zA-Z] %d", &tm_hour, &tm_min, &tm_sec, &tm_mday, str_mon, &tm_year) == 6 && + scan_mon(str_mon, &tm_mon)) { + tm.tm_hour = tm_hour; + tm.tm_min = tm_min; + tm.tm_sec = tm_sec; + tm.tm_mday = tm_mday; + tm.tm_mon = tm_mon; + tm.tm_year = tm_year; + } + + /* wdy, DD-mth-YY HH:MM:SS GMT */ + else if (sscanf(cp, "%[a-zA-Z], %d-%[a-zA-Z]-%d %d:%d:%d GMT", str_wday, &tm_mday, str_mon, &tm_year, &tm_hour, &tm_min, + &tm_sec) == 7 && + scan_wday(str_wday, &tm_wday) && scan_mon(str_mon, &tm_mon)) { + tm.tm_wday = tm_wday; + tm.tm_mday = tm_mday; + tm.tm_mon = tm_mon; + tm.tm_year = tm_year; + tm.tm_hour = tm_hour; + tm.tm_min = tm_min; + tm.tm_sec = tm_sec; + } + + /* wdy, DD mth YY HH:MM:SS GMT */ + else if (sscanf(cp, "%[a-zA-Z], %d %[a-zA-Z] %d %d:%d:%d GMT", str_wday, &tm_mday, str_mon, &tm_year, &tm_hour, &tm_min, + &tm_sec) == 7 && + scan_wday(str_wday, &tm_wday) && scan_mon(str_mon, &tm_mon)) { + tm.tm_wday = tm_wday; + tm.tm_mday = 
tm_mday; + tm.tm_mon = tm_mon; + tm.tm_year = tm_year; + tm.tm_hour = tm_hour; + tm.tm_min = tm_min; + tm.tm_sec = tm_sec; + } + + /* wdy mth DD HH:MM:SS GMT YY */ + else if (sscanf(cp, "%[a-zA-Z] %[a-zA-Z] %d %d:%d:%d GMT %d", str_wday, str_mon, &tm_mday, &tm_hour, &tm_min, &tm_sec, + &tm_year) == 7 && + scan_wday(str_wday, &tm_wday) && scan_mon(str_mon, &tm_mon)) { + tm.tm_wday = tm_wday; + tm.tm_mon = tm_mon; + tm.tm_mday = tm_mday; + tm.tm_hour = tm_hour; + tm.tm_min = tm_min; + tm.tm_sec = tm_sec; + tm.tm_year = tm_year; + } else + return (time_t)-1; + + if (tm.tm_year > 1900) + tm.tm_year -= 1900; + else if (tm.tm_year < 70) + tm.tm_year += 100; + + t = tm_to_time(&tm); + + return t; +} + +/* + Convert 't' (in time_t format) into the HTTP date format + + t: input (epoch-based time) + str: output string that holds the HTTP date strinng + strlen: the buffer size of str + + + 0 : in case of successful conversion + -1 : otherwise + by KyoungSoo Park +*/ +int timet_to_httpdate(time_t t, char *str, int strlen) +{ + static const char *day_of_week[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; + + static const char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; + struct tm gm; + + if (gmtime_r(&t, &gm) == NULL) + return (-1); + + /* example date: "Sat, 26 Mar 2011 05:53:57 GMT" */ + if (snprintf(str, strlen, "%s, %02d %s %4d %02d:%02d:%02d GMT", day_of_week[gm.tm_wday], gm.tm_mday, months[gm.tm_mon], + gm.tm_year + 1900, gm.tm_hour, gm.tm_min, gm.tm_sec) == strlen) + /* probably str has an insufficient buffer size */ + return (-1); + return (0); +} diff --git a/meson.build b/meson.build index e5d8c01..7c79d63 100644 --- a/meson.build +++ b/meson.build @@ -56,6 +56,11 @@ if cc.has_header('ncurses.h', required: true) add_project_link_arguments('-lncurses', language: 'c') endif +# use libnuma library +if cc.has_header('numa.h', required: true) + add_project_link_arguments('-lnuma', language: 'c') +endif + 
add_project_link_arguments('-ldl', language: 'c') xdp_dep = dependency( From 215f33b0de86a5780199d95c3232685507c8cfbf Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Wed, 10 Sep 2025 17:16:13 +0530 Subject: [PATCH 21/43] feat: initial mtcp support for flash --- lib/flash/meson.build | 7 + lib/flash/mtcp/addr_pool.c | 368 +++++ lib/flash/mtcp/api.c | 1640 +++++++++++++++++++ lib/flash/mtcp/arp.c | 386 +++++ lib/flash/mtcp/ccp.c | 326 ++++ lib/flash/mtcp/clock.c | 81 + lib/flash/mtcp/config.c | 770 +++++++++ lib/flash/mtcp/core.c | 1657 ++++++++++++++++++++ lib/flash/mtcp/cpu.c | 144 ++ lib/flash/mtcp/debug.c | 286 ++++ lib/flash/mtcp/dpdk_module.c | 942 +++++++++++ lib/flash/mtcp/eth_in.c | 84 + lib/flash/mtcp/eth_out.c | 110 ++ lib/flash/mtcp/eventpoll.c | 633 ++++++++ lib/flash/mtcp/fhash.c | 221 +++ lib/flash/mtcp/flash_module.c | 316 ++++ lib/flash/mtcp/icmp.c | 178 +++ lib/flash/mtcp/include/addr_pool.h | 65 + lib/flash/mtcp/include/arp.h | 51 + lib/flash/mtcp/include/ccp.h | 68 + lib/flash/mtcp/include/clock.h | 44 + lib/flash/mtcp/include/config.h | 69 + lib/flash/mtcp/include/cpu.h | 39 + lib/flash/mtcp/include/debug.h | 272 ++++ lib/flash/mtcp/include/eth_in.h | 39 + lib/flash/mtcp/include/eth_out.h | 45 + lib/flash/mtcp/include/eventpoll.h | 81 + lib/flash/mtcp/include/fhash.h | 79 + lib/flash/mtcp/include/icmp.h | 79 + lib/flash/mtcp/include/io_module.h | 160 ++ lib/flash/mtcp/include/ip_in.h | 39 + lib/flash/mtcp/include/ip_out.h | 47 + lib/flash/mtcp/include/logger.h | 74 + lib/flash/mtcp/include/memory_mgt.h | 68 + lib/flash/mtcp/include/mtcp.h | 380 +++++ lib/flash/mtcp/include/mtcp_api.h | 144 ++ lib/flash/mtcp/include/mtcp_epoll.h | 92 ++ lib/flash/mtcp/include/netmap.h | 640 ++++++++ lib/flash/mtcp/include/netmap_user.h | 968 ++++++++++++ lib/flash/mtcp/include/pacing.h | 63 + lib/flash/mtcp/include/pipe.h | 45 + lib/flash/mtcp/include/ps.h | 346 ++++ lib/flash/mtcp/include/rss.h | 40 + lib/flash/mtcp/include/socket.h | 87 + 
lib/flash/mtcp/include/stat.h | 110 ++ lib/flash/mtcp/include/tcp_in.h | 149 ++ lib/flash/mtcp/include/tcp_out.h | 67 + lib/flash/mtcp/include/tcp_rb_frag_queue.h | 49 + lib/flash/mtcp/include/tcp_ring_buffer.h | 100 ++ lib/flash/mtcp/include/tcp_sb_queue.h | 49 + lib/flash/mtcp/include/tcp_send_buffer.h | 69 + lib/flash/mtcp/include/tcp_stream.h | 276 ++++ lib/flash/mtcp/include/tcp_stream_queue.h | 105 ++ lib/flash/mtcp/include/tcp_util.h | 69 + lib/flash/mtcp/include/timer.h | 71 + lib/flash/mtcp/io_module.c | 749 +++++++++ lib/flash/mtcp/ip_in.c | 92 ++ lib/flash/mtcp/ip_out.c | 199 +++ lib/flash/mtcp/logger.c | 193 +++ lib/flash/mtcp/memory_mgt.c | 232 +++ lib/flash/mtcp/meson.build | 46 + lib/flash/mtcp/netmap_module.c | 299 ++++ lib/flash/mtcp/onvm_module.c | 579 +++++++ lib/flash/mtcp/pacing.c | 127 ++ lib/flash/mtcp/pipe.c | 437 ++++++ lib/flash/mtcp/psio_module.c | 426 +++++ lib/flash/mtcp/rss.c | 135 ++ lib/flash/mtcp/socket.c | 118 ++ lib/flash/mtcp/tcp_in.c | 1290 +++++++++++++++ lib/flash/mtcp/tcp_out.c | 1072 +++++++++++++ lib/flash/mtcp/tcp_rb_frag_queue.c | 123 ++ lib/flash/mtcp/tcp_ring_buffer.c | 432 +++++ lib/flash/mtcp/tcp_sb_queue.c | 123 ++ lib/flash/mtcp/tcp_send_buffer.c | 207 +++ lib/flash/mtcp/tcp_stream.c | 657 ++++++++ lib/flash/mtcp/tcp_stream_queue.c | 203 +++ lib/flash/mtcp/tcp_util.c | 347 ++++ lib/flash/mtcp/timer.c | 515 ++++++ meson.build | 21 + meson_options.txt | 3 + 80 files changed, 22012 insertions(+) create mode 100644 lib/flash/mtcp/addr_pool.c create mode 100644 lib/flash/mtcp/api.c create mode 100644 lib/flash/mtcp/arp.c create mode 100644 lib/flash/mtcp/ccp.c create mode 100644 lib/flash/mtcp/clock.c create mode 100644 lib/flash/mtcp/config.c create mode 100644 lib/flash/mtcp/core.c create mode 100644 lib/flash/mtcp/cpu.c create mode 100644 lib/flash/mtcp/debug.c create mode 100644 lib/flash/mtcp/dpdk_module.c create mode 100644 lib/flash/mtcp/eth_in.c create mode 100644 lib/flash/mtcp/eth_out.c create mode 100644 
lib/flash/mtcp/eventpoll.c create mode 100644 lib/flash/mtcp/fhash.c create mode 100644 lib/flash/mtcp/flash_module.c create mode 100644 lib/flash/mtcp/icmp.c create mode 100644 lib/flash/mtcp/include/addr_pool.h create mode 100644 lib/flash/mtcp/include/arp.h create mode 100644 lib/flash/mtcp/include/ccp.h create mode 100644 lib/flash/mtcp/include/clock.h create mode 100644 lib/flash/mtcp/include/config.h create mode 100644 lib/flash/mtcp/include/cpu.h create mode 100644 lib/flash/mtcp/include/debug.h create mode 100644 lib/flash/mtcp/include/eth_in.h create mode 100644 lib/flash/mtcp/include/eth_out.h create mode 100644 lib/flash/mtcp/include/eventpoll.h create mode 100644 lib/flash/mtcp/include/fhash.h create mode 100644 lib/flash/mtcp/include/icmp.h create mode 100644 lib/flash/mtcp/include/io_module.h create mode 100644 lib/flash/mtcp/include/ip_in.h create mode 100644 lib/flash/mtcp/include/ip_out.h create mode 100644 lib/flash/mtcp/include/logger.h create mode 100644 lib/flash/mtcp/include/memory_mgt.h create mode 100644 lib/flash/mtcp/include/mtcp.h create mode 100644 lib/flash/mtcp/include/mtcp_api.h create mode 100644 lib/flash/mtcp/include/mtcp_epoll.h create mode 100644 lib/flash/mtcp/include/netmap.h create mode 100644 lib/flash/mtcp/include/netmap_user.h create mode 100644 lib/flash/mtcp/include/pacing.h create mode 100644 lib/flash/mtcp/include/pipe.h create mode 100644 lib/flash/mtcp/include/ps.h create mode 100644 lib/flash/mtcp/include/rss.h create mode 100644 lib/flash/mtcp/include/socket.h create mode 100644 lib/flash/mtcp/include/stat.h create mode 100644 lib/flash/mtcp/include/tcp_in.h create mode 100644 lib/flash/mtcp/include/tcp_out.h create mode 100644 lib/flash/mtcp/include/tcp_rb_frag_queue.h create mode 100644 lib/flash/mtcp/include/tcp_ring_buffer.h create mode 100644 lib/flash/mtcp/include/tcp_sb_queue.h create mode 100644 lib/flash/mtcp/include/tcp_send_buffer.h create mode 100644 lib/flash/mtcp/include/tcp_stream.h create mode 100644 
lib/flash/mtcp/include/tcp_stream_queue.h create mode 100644 lib/flash/mtcp/include/tcp_util.h create mode 100644 lib/flash/mtcp/include/timer.h create mode 100644 lib/flash/mtcp/io_module.c create mode 100644 lib/flash/mtcp/ip_in.c create mode 100644 lib/flash/mtcp/ip_out.c create mode 100644 lib/flash/mtcp/logger.c create mode 100644 lib/flash/mtcp/memory_mgt.c create mode 100644 lib/flash/mtcp/meson.build create mode 100644 lib/flash/mtcp/netmap_module.c create mode 100644 lib/flash/mtcp/onvm_module.c create mode 100644 lib/flash/mtcp/pacing.c create mode 100644 lib/flash/mtcp/pipe.c create mode 100644 lib/flash/mtcp/psio_module.c create mode 100644 lib/flash/mtcp/rss.c create mode 100644 lib/flash/mtcp/socket.c create mode 100644 lib/flash/mtcp/tcp_in.c create mode 100644 lib/flash/mtcp/tcp_out.c create mode 100644 lib/flash/mtcp/tcp_rb_frag_queue.c create mode 100644 lib/flash/mtcp/tcp_ring_buffer.c create mode 100644 lib/flash/mtcp/tcp_sb_queue.c create mode 100644 lib/flash/mtcp/tcp_send_buffer.c create mode 100644 lib/flash/mtcp/tcp_stream.c create mode 100644 lib/flash/mtcp/tcp_stream_queue.c create mode 100644 lib/flash/mtcp/tcp_util.c create mode 100644 lib/flash/mtcp/timer.c diff --git a/lib/flash/meson.build b/lib/flash/meson.build index b884837..7766ee5 100644 --- a/lib/flash/meson.build +++ b/lib/flash/meson.build @@ -7,6 +7,13 @@ special_dirs = [ dirs = ['uds', 'common', 'monitor', 'params', 'pool', 'nf', 'util'] +if get_option('enable_mtcp') + dirs += ['mtcp'] + message('mTCP support enabled') +else + message('mTCP support disabled') +endif + foreach special_dir : special_dirs sources = [] headers = [] diff --git a/lib/flash/mtcp/addr_pool.c b/lib/flash/mtcp/addr_pool.c new file mode 100644 index 0000000..99baf80 --- /dev/null +++ b/lib/flash/mtcp/addr_pool.c @@ -0,0 +1,368 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include "addr_pool.h" +#include "rss.h" +#include "debug.h" + +/*----------------------------------------------------------------------------*/ +struct addr_entry { + struct sockaddr_in addr; + TAILQ_ENTRY(addr_entry) addr_link; +}; +/*----------------------------------------------------------------------------*/ +struct addr_map { + struct addr_entry *addrmap[MAX_PORT]; +}; +/*----------------------------------------------------------------------------*/ +struct addr_pool { + struct addr_entry *pool; /* address pool */ + struct addr_map *mapper; /* address map */ + + uint32_t addr_base; /* in host order */ + int num_addr; /* number of addresses in use */ + + int num_entry; + int num_free; + int num_used; + + pthread_mutex_t lock; + TAILQ_HEAD(, addr_entry) free_list; + TAILQ_HEAD(, addr_entry) used_list; +}; +/*----------------------------------------------------------------------------*/ +addr_pool_t CreateAddressPool(in_addr_t addr_base, int num_addr) +{ + struct addr_pool *ap; + int num_entry; + int i, j, cnt; + in_addr_t addr; + uint32_t addr_h; + + ap = (addr_pool_t)calloc(1, sizeof(struct addr_pool)); + if (!ap) + return NULL; + + /* initialize address pool */ + num_entry = num_addr * (MAX_PORT - MIN_PORT); + ap->pool = (struct addr_entry *)calloc(num_entry, sizeof(struct addr_entry)); + if (!ap->pool) { + free(ap); + return NULL; + } + + /* initialize address map */ + ap->mapper = (struct addr_map *)calloc(num_addr, sizeof(struct addr_map)); + if (!ap->mapper) { + free(ap->pool); + free(ap); + return NULL; + } + + TAILQ_INIT(&ap->free_list); + TAILQ_INIT(&ap->used_list); + + if (pthread_mutex_init(&ap->lock, NULL)) { + free(ap->pool); + free(ap); + return NULL; + } + + pthread_mutex_lock(&ap->lock); + + ap->addr_base = ntohl(addr_base); + ap->num_addr = num_addr; + + cnt = 0; + for (i = 0; i < num_addr; i++) { + addr_h = ap->addr_base + i; + addr = htonl(addr_h); + for (j = MIN_PORT; j < MAX_PORT; j++) { + 
ap->pool[cnt].addr.sin_addr.s_addr = addr; + ap->pool[cnt].addr.sin_port = htons(j); + ap->mapper[i].addrmap[j] = &ap->pool[cnt]; + + TAILQ_INSERT_TAIL(&ap->free_list, &ap->pool[cnt], addr_link); + + if ((++cnt) >= num_entry) + break; + } + } + ap->num_entry = cnt; + ap->num_free = cnt; + ap->num_used = 0; + + pthread_mutex_unlock(&ap->lock); + + return ap; +} +/*----------------------------------------------------------------------------*/ +addr_pool_t CreateAddressPoolPerCore(int core, int num_queues, in_addr_t saddr_base, int num_addr, in_addr_t daddr, in_port_t dport) +{ + struct addr_pool *ap; + int num_entry; + int i, j, cnt; + in_addr_t saddr; + uint32_t saddr_h, daddr_h; + uint16_t sport_h, dport_h; + int rss_core; +#if 0 + uint8_t endian_check = (current_iomodule_func == &dpdk_module_func) ? + 0 : 1; +#else + uint8_t endian_check = FetchEndianType(); +#endif + + ap = (addr_pool_t)calloc(1, sizeof(struct addr_pool)); + if (!ap) + return NULL; + + /* initialize address pool */ + num_entry = (num_addr * (MAX_PORT - MIN_PORT)) / num_queues; + ap->pool = (struct addr_entry *)calloc(num_entry, sizeof(struct addr_entry)); + if (!ap->pool) { + free(ap); + return NULL; + } + + /* initialize address map */ + ap->mapper = (struct addr_map *)calloc(num_addr, sizeof(struct addr_map)); + if (!ap->mapper) { + free(ap->pool); + free(ap); + return NULL; + } + + TAILQ_INIT(&ap->free_list); + TAILQ_INIT(&ap->used_list); + + if (pthread_mutex_init(&ap->lock, NULL)) { + free(ap->pool); + free(ap); + return NULL; + } + + pthread_mutex_lock(&ap->lock); + + ap->addr_base = ntohl(saddr_base); + ap->num_addr = num_addr; + daddr_h = ntohl(daddr); + dport_h = ntohs(dport); + + /* search address space to get RSS-friendly addresses */ + cnt = 0; + for (i = 0; i < num_addr; i++) { + saddr_h = ap->addr_base + i; + saddr = htonl(saddr_h); + for (j = MIN_PORT; j < MAX_PORT; j++) { + if (cnt >= num_entry) + break; + + sport_h = j; + rss_core = GetRSSCPUCore(daddr_h, saddr_h, dport_h, 
sport_h, num_queues, endian_check); + if (rss_core != core) + continue; + + ap->pool[cnt].addr.sin_addr.s_addr = saddr; + ap->pool[cnt].addr.sin_port = htons(sport_h); + ap->mapper[i].addrmap[j] = &ap->pool[cnt]; + TAILQ_INSERT_TAIL(&ap->free_list, &ap->pool[cnt], addr_link); + cnt++; + } + } + + ap->num_entry = cnt; + ap->num_free = cnt; + ap->num_used = 0; + //fprintf(stderr, "CPU %d: Created %d address entries.\n", core, cnt); + if (ap->num_entry < CONFIG.max_concurrency) { + fprintf(stderr, + "[WARINING] Available # addresses (%d) is smaller than" + " the max concurrency (%d).\n", + ap->num_entry, CONFIG.max_concurrency); + } + + pthread_mutex_unlock(&ap->lock); + + return ap; +} +/*----------------------------------------------------------------------------*/ +void DestroyAddressPool(addr_pool_t ap) +{ + if (!ap) + return; + + if (ap->pool) { + free(ap->pool); + ap->pool = NULL; + } + + if (ap->mapper) { + free(ap->mapper); + ap->mapper = NULL; + } + + pthread_mutex_destroy(&ap->lock); + + free(ap); +} +/*----------------------------------------------------------------------------*/ +int FetchAddress(addr_pool_t ap, int core, int num_queues, const struct sockaddr_in *daddr, struct sockaddr_in *saddr) +{ + struct addr_entry *walk, *next; + int rss_core; + int ret = -1; +#if 0 + uint8_t endian_check = (current_iomodule_func == &dpdk_module_func) ? 
+ 0 : 1; +#else + uint8_t endian_check = FetchEndianType(); +#endif + + if (!ap || !daddr || !saddr) + return -1; + + pthread_mutex_lock(&ap->lock); + + walk = TAILQ_FIRST(&ap->free_list); + while (walk) { + next = TAILQ_NEXT(walk, addr_link); + + if (saddr->sin_addr.s_addr != INADDR_ANY && walk->addr.sin_addr.s_addr != saddr->sin_addr.s_addr) { + walk = next; + continue; + } + + if (saddr->sin_port != INPORT_ANY && walk->addr.sin_port != saddr->sin_port) { + walk = next; + continue; + } + + rss_core = GetRSSCPUCore(ntohl(walk->addr.sin_addr.s_addr), ntohl(daddr->sin_addr.s_addr), ntohs(walk->addr.sin_port), + ntohs(daddr->sin_port), num_queues, endian_check); + + if (core == rss_core) + break; + + walk = next; + } + + if (walk) { + *saddr = walk->addr; + TAILQ_REMOVE(&ap->free_list, walk, addr_link); + TAILQ_INSERT_TAIL(&ap->used_list, walk, addr_link); + ap->num_free--; + ap->num_used++; + ret = 0; + } + + pthread_mutex_unlock(&ap->lock); + + return ret; +} +/*----------------------------------------------------------------------------*/ +int FetchAddressPerCore(addr_pool_t ap, int core, int num_queues, const struct sockaddr_in *daddr, struct sockaddr_in *saddr) +{ + (void)core; + (void)num_queues; + struct addr_entry *walk; + int ret = -1; + + if (!ap || !daddr || !saddr) + return -1; + + pthread_mutex_lock(&ap->lock); + + /* we don't need to calculate RSSCPUCore if mtcp_init_rss is called */ + walk = TAILQ_FIRST(&ap->free_list); + if (walk) { + *saddr = walk->addr; + TAILQ_REMOVE(&ap->free_list, walk, addr_link); + TAILQ_INSERT_TAIL(&ap->used_list, walk, addr_link); + ap->num_free--; + ap->num_used++; + ret = 0; + } + + pthread_mutex_unlock(&ap->lock); + + return ret; +} +/*----------------------------------------------------------------------------*/ +int FreeAddress(addr_pool_t ap, const struct sockaddr_in *addr) +{ + struct addr_entry *walk, *next; + int ret = -1; + + if (!ap || !addr) + return -1; + + pthread_mutex_lock(&ap->lock); + + if (ap->mapper) { + 
uint32_t addr_h = ntohl(addr->sin_addr.s_addr); + uint16_t port_h = ntohs(addr->sin_port); + int index = addr_h - ap->addr_base; + + if (index >= 0 && index < ap->num_addr) { + walk = ap->mapper[addr_h - ap->addr_base].addrmap[port_h]; + } else { + walk = NULL; + } + + } else { + walk = TAILQ_FIRST(&ap->used_list); + while (walk) { + next = TAILQ_NEXT(walk, addr_link); + if (addr->sin_port == walk->addr.sin_port && addr->sin_addr.s_addr == walk->addr.sin_addr.s_addr) { + break; + } + + walk = next; + } + } + + if (walk) { + TAILQ_REMOVE(&ap->used_list, walk, addr_link); + TAILQ_INSERT_TAIL(&ap->free_list, walk, addr_link); + ap->num_free++; + ap->num_used--; + ret = 0; + } + + pthread_mutex_unlock(&ap->lock); + + return ret; +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/api.c b/lib/flash/mtcp/api.c new file mode 100644 index 0000000..e3dac72 --- /dev/null +++ b/lib/flash/mtcp/api.c @@ -0,0 +1,1640 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include "mtcp.h" +#include "mtcp_api.h" +#include "tcp_in.h" +#include "tcp_stream.h" +#include "tcp_out.h" +#include "ip_out.h" +#include "eventpoll.h" +#include "pipe.h" +#include "fhash.h" +#include "addr_pool.h" +#include "rss.h" +#include "config.h" +#include "debug.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +/*----------------------------------------------------------------------------*/ +static inline int mtcp_is_connected(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + (void)mtcp; + if (!cur_stream) { + TRACE_API("Stream does not exist\n"); + return FALSE; + } + if (cur_stream->state != TCP_ST_ESTABLISHED) { + TRACE_API("Stream %d not ESTABLISHED. 
state: %s\n", cur_stream->id, TCPStateToString(cur_stream)); + return FALSE; + } + + return TRUE; +} +/*----------------------------------------------------------------------------*/ +inline mtcp_manager_t GetMTCPManager(mctx_t mctx) +{ + if (!mctx) { + errno = EINVAL; + return NULL; + } + + if (mctx->cpu < 0 || mctx->cpu >= num_cpus) { + errno = EINVAL; + return NULL; + } + + if (g_mtcp[mctx->cpu]->ctx->done || g_mtcp[mctx->cpu]->ctx->exit) { + errno = EPERM; + return NULL; + } + + return g_mtcp[mctx->cpu]; +} +/*----------------------------------------------------------------------------*/ +static inline int GetSocketError(socket_map_t socket, void *optval, socklen_t *optlen) +{ + tcp_stream *cur_stream; + + if (!socket->stream) { + errno = EBADF; + return -1; + } + + cur_stream = socket->stream; + if (cur_stream->state == TCP_ST_CLOSED) { + if (cur_stream->close_reason == TCP_TIMEDOUT || cur_stream->close_reason == TCP_CONN_FAIL || + cur_stream->close_reason == TCP_CONN_LOST) { + *(int *)optval = ETIMEDOUT; + *optlen = sizeof(int); + + return 0; + } + } + + if (cur_stream->state == TCP_ST_CLOSE_WAIT || cur_stream->state == TCP_ST_CLOSED) { + if (cur_stream->close_reason == TCP_RESET) { + *(int *)optval = ECONNRESET; + *optlen = sizeof(int); + + return 0; + } + } + + if (cur_stream->state == TCP_ST_SYN_SENT && errno == EINPROGRESS) { + *(int *)optval = errno; + *optlen = sizeof(int); + return -1; + } + + /* + * `base case`: If socket sees no so_error, then + * this also means close_reason will always be + * TCP_NOT_CLOSED. 
+ */ + if (cur_stream->close_reason == TCP_NOT_CLOSED) { + *(int *)optval = 0; + *optlen = sizeof(int); + + return 0; + } + + errno = ENOSYS; + return -1; +} +/*----------------------------------------------------------------------------*/ +int mtcp_getsockname(mctx_t mctx, int sockid, struct sockaddr *addr, socklen_t *addrlen) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (*addrlen <= 0) { + TRACE_API("Invalid addrlen: %d\n", *addrlen); + errno = EINVAL; + return -1; + } + + if (socket->socktype != MTCP_SOCK_LISTENER && socket->socktype != MTCP_SOCK_STREAM) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + *(struct sockaddr_in *)addr = socket->saddr; + *addrlen = sizeof(socket->saddr); + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_getpeername(mctx_t mctx, int sockid, struct sockaddr *addr, socklen_t *addrlen) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + struct sockaddr_in *addr_in; + tcp_stream *stream; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (*addrlen <= 0) { + TRACE_API("Invalid addrlen: %d\n", *addrlen); + errno = EINVAL; + return -1; + } + + if (socket->socktype != MTCP_SOCK_LISTENER && socket->socktype != MTCP_SOCK_STREAM) { + 
TRACE_API("Invalid socket id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + stream = socket->stream; + if (!mtcp_is_connected(mtcp, stream)) { + errno = ENOTCONN; + return -1; + } + + addr_in = (struct sockaddr_in *)addr; + addr_in->sin_family = AF_INET; + addr_in->sin_port = stream->dport; + addr_in->sin_addr.s_addr = stream->daddr; + *addrlen = sizeof(*addr_in); + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_getsockopt(mctx_t mctx, int sockid, int level, int optname, void *optval, socklen_t *optlen) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (socket->socktype != MTCP_SOCK_LISTENER && socket->socktype != MTCP_SOCK_STREAM) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + if (level == SOL_SOCKET) { + if (optname == SO_ERROR) { + if (socket->socktype == MTCP_SOCK_STREAM) { + return GetSocketError(socket, optval, optlen); + } + } + } + + errno = ENOSYS; + return -1; +} +/*----------------------------------------------------------------------------*/ +int mtcp_setsockopt(mctx_t mctx, int sockid, int level, int optname, const void *optval, socklen_t optlen) +{ + (void)level; + (void)optname; + (void)optval; + (void)optlen; + mtcp_manager_t mtcp; + socket_map_t socket; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket 
id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (socket->socktype != MTCP_SOCK_LISTENER && socket->socktype != MTCP_SOCK_STREAM) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_setsock_nonblock(mctx_t mctx, int sockid) +{ + mtcp_manager_t mtcp; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[sockid].socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + mtcp->smap[sockid].opts |= MTCP_NONBLOCK; + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_socket_ioctl(mctx_t mctx, int sockid, int request, void *argp) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + /* only support stream socket */ + socket = &mtcp->smap[sockid]; + if (socket->socktype != MTCP_SOCK_STREAM && socket->socktype != MTCP_SOCK_LISTENER) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (!argp) { + errno = EFAULT; + return -1; + } + + if (request == FIONREAD) { + tcp_stream *cur_stream; + struct tcp_ring_buffer *rbuf; + + cur_stream = socket->stream; + if (!cur_stream) { + errno = EBADF; + return -1; + } + rbuf = cur_stream->rcvvar->rcvbuf; + if (rbuf) { + *(int *)argp = rbuf->merged_len; + } else { + *(int *)argp = 0; + } + + } else if (request == FIONBIO) { + int32_t arg = *(int32_t *)argp; + if (arg != 0) + return mtcp_setsock_nonblock(mctx, sockid); + } else { + errno = EINVAL; + return -1; + 
} + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_socket(mctx_t mctx, int domain, int type, int protocol) +{ + (void)protocol; + mtcp_manager_t mtcp; + socket_map_t socket; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (domain != AF_INET) { + errno = EAFNOSUPPORT; + return -1; + } + + if (type == SOCK_STREAM) { + type = (int)MTCP_SOCK_STREAM; + } else { + errno = EINVAL; + return -1; + } + + socket = AllocateSocket(mctx, type, FALSE); + if (!socket) { + errno = ENFILE; + return -1; + } + + return socket->id; +} +/*----------------------------------------------------------------------------*/ +int mtcp_bind(mctx_t mctx, int sockid, const struct sockaddr *addr, socklen_t addrlen) +{ + mtcp_manager_t mtcp; + const struct sockaddr_in *addr_in; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[sockid].socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[sockid].socktype != MTCP_SOCK_STREAM && mtcp->smap[sockid].socktype != MTCP_SOCK_LISTENER) { + TRACE_API("Not a stream socket id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + if (!addr) { + TRACE_API("Socket %d: empty address!\n", sockid); + errno = EINVAL; + return -1; + } + + if (mtcp->smap[sockid].opts & MTCP_ADDR_BIND) { + TRACE_API("Socket %d: adress already bind for this socket.\n", sockid); + errno = EINVAL; + return -1; + } + + /* we only allow bind() for AF_INET address */ + if (addr->sa_family != AF_INET || addrlen < sizeof(struct sockaddr_in)) { + TRACE_API("Socket %d: invalid argument!\n", sockid); + errno = EINVAL; + return -1; + } + + /* TODO: validate whether the address is already being used */ + + addr_in = (const struct sockaddr_in *)addr; + 
mtcp->smap[sockid].saddr = *addr_in; + mtcp->smap[sockid].opts |= MTCP_ADDR_BIND; + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_listen(mctx_t mctx, int sockid, int backlog) +{ + mtcp_manager_t mtcp; + struct tcp_listener *listener; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[sockid].socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[sockid].socktype == MTCP_SOCK_STREAM) { + mtcp->smap[sockid].socktype = MTCP_SOCK_LISTENER; + } + + if (mtcp->smap[sockid].socktype != MTCP_SOCK_LISTENER) { + TRACE_API("Not a listening socket. id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + if (backlog <= 0 || backlog > CONFIG.max_concurrency) { + errno = EINVAL; + return -1; + } + + /* check whether we are not already listening on the same port */ + if (ListenerHTSearch(mtcp->listeners, &mtcp->smap[sockid].saddr.sin_port)) { + errno = EADDRINUSE; + return -1; + } + + listener = (struct tcp_listener *)calloc(1, sizeof(struct tcp_listener)); + if (!listener) { + /* errno set from the malloc() */ + return -1; + } + + listener->sockid = sockid; + listener->backlog = backlog; + listener->socket = &mtcp->smap[sockid]; + + if (pthread_cond_init(&listener->accept_cond, NULL)) { + /* errno set internally */ + perror("pthread_cond_init of ctx->accept_cond\n"); + free(listener); + return -1; + } + if (pthread_mutex_init(&listener->accept_lock, NULL)) { + /* errno set internally */ + perror("pthread_mutex_init of ctx->accept_lock\n"); + free(listener); + return -1; + } + + listener->acceptq = CreateStreamQueue(backlog); + if (!listener->acceptq) { + free(listener); + errno = ENOMEM; + return -1; + } + + mtcp->smap[sockid].listener = listener; + 
ListenerHTInsert(mtcp->listeners, listener); + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_accept(mctx_t mctx, int sockid, struct sockaddr *addr, socklen_t *addrlen) +{ + mtcp_manager_t mtcp; + struct tcp_listener *listener; + socket_map_t socket; + tcp_stream *accepted = NULL; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + /* requires listening socket */ + if (mtcp->smap[sockid].socktype != MTCP_SOCK_LISTENER) { + errno = EINVAL; + return -1; + } + + listener = mtcp->smap[sockid].listener; + + /* dequeue from the acceptq without lock first */ + /* if nothing there, acquire lock and cond_wait */ + accepted = StreamDequeue(listener->acceptq); + if (!accepted) { + if (listener->socket->opts & MTCP_NONBLOCK) { + errno = EAGAIN; + return -1; + + } else { + pthread_mutex_lock(&listener->accept_lock); + while ((accepted = StreamDequeue(listener->acceptq)) == NULL) { + pthread_cond_wait(&listener->accept_cond, &listener->accept_lock); + + if (mtcp->ctx->done || mtcp->ctx->exit) { + pthread_mutex_unlock(&listener->accept_lock); + errno = EINTR; + return -1; + } + } + pthread_mutex_unlock(&listener->accept_lock); + } + } + + if (!accepted) { + TRACE_ERROR("[NEVER HAPPEN] Empty accept queue!\n"); + } + + if (!accepted->socket) { + socket = AllocateSocket(mctx, MTCP_SOCK_STREAM, FALSE); + if (!socket) { + TRACE_ERROR("Failed to create new socket!\n"); + /* TODO: destroy the stream */ + errno = ENFILE; + return -1; + } + socket->stream = accepted; + accepted->socket = socket; + + /* set socket parameters */ + socket->saddr.sin_family = AF_INET; + socket->saddr.sin_port = accepted->dport; + socket->saddr.sin_addr.s_addr = accepted->daddr; + } + + if (!(listener->socket->epoll & MTCP_EPOLLET) && !StreamQueueIsEmpty(listener->acceptq)) + 
AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, listener->socket, MTCP_EPOLLIN); + + TRACE_API("Stream %d accepted.\n", accepted->id); + + if (addr && addrlen) { + struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; + addr_in->sin_family = AF_INET; + addr_in->sin_port = accepted->dport; + addr_in->sin_addr.s_addr = accepted->daddr; + *addrlen = sizeof(struct sockaddr_in); + } + + return accepted->socket->id; +} +/*----------------------------------------------------------------------------*/ +int mtcp_init_rss(mctx_t mctx, in_addr_t saddr_base, int num_addr, in_addr_t daddr, in_addr_t dport) +{ + mtcp_manager_t mtcp; + addr_pool_t ap; + uint8_t is_external; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + errno = EACCES; + return -1; + } + + if (mtcp->ap) { + TRACE_DBG("Destroying already exsiting address pool.\n" + "Are you calling mtcp_init_rss() multiple times?\n"); + DestroyAddressPool(mtcp->ap); + mtcp->ap = NULL; + } + + if (saddr_base == INADDR_ANY) { + int nif_out, eidx; + + /* for the INADDR_ANY, find the output interface for the destination + and set the saddr_base as the ip address of the output interface */ + nif_out = GetOutputInterface(daddr, &is_external); + if (nif_out < 0) { + errno = EINVAL; + TRACE_DBG("Could not determine nif idx!\n"); + return -1; + } + eidx = CONFIG.nif_to_eidx[nif_out]; + saddr_base = CONFIG.eths[eidx].ip_addr; + } + + ap = CreateAddressPoolPerCore(mctx->cpu, num_cpus, saddr_base, num_addr, daddr, dport); + if (!ap) { + errno = ENOMEM; + return -1; + } + + mtcp->ap = ap; + UNUSED(is_external); + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_connect(mctx_t mctx, int sockid, const struct sockaddr *addr, socklen_t addrlen) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + tcp_stream *cur_stream; + const struct sockaddr_in *addr_in; + in_addr_t dip; + in_port_t dport; + int is_dyn_bound = FALSE; + int ret, nif; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + 
return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[sockid].socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[sockid].socktype != MTCP_SOCK_STREAM) { + TRACE_API("Not an end socket. id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + if (!addr) { + TRACE_API("Socket %d: empty address!\n", sockid); + errno = EFAULT; + return -1; + } + + /* we only allow bind() for AF_INET address */ + if (addr->sa_family != AF_INET || addrlen < sizeof(struct sockaddr_in)) { + TRACE_API("Socket %d: invalid argument!\n", sockid); + errno = EAFNOSUPPORT; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->stream) { + TRACE_API("Socket %d: stream already exist!\n", sockid); + if (socket->stream->state >= TCP_ST_ESTABLISHED) { + errno = EISCONN; + } else { + errno = EALREADY; + } + return -1; + } + + addr_in = (const struct sockaddr_in *)addr; + dip = addr_in->sin_addr.s_addr; + dport = addr_in->sin_port; + + /* address binding */ + if ((socket->opts & MTCP_ADDR_BIND) && socket->saddr.sin_port != INPORT_ANY && socket->saddr.sin_addr.s_addr != INADDR_ANY) { + int rss_core; + uint8_t endian_check = FetchEndianType(); + + rss_core = GetRSSCPUCore(socket->saddr.sin_addr.s_addr, dip, socket->saddr.sin_port, dport, num_queues, endian_check); + + if (rss_core != mctx->cpu) { + errno = EINVAL; + return -1; + } + } else { + if (mtcp->ap) { + ret = FetchAddressPerCore(mtcp->ap, mctx->cpu, num_queues, addr_in, &socket->saddr); + } else { + uint8_t is_external; + nif = GetOutputInterface(dip, &is_external); + if (nif < 0) { + errno = EINVAL; + return -1; + } + ret = FetchAddress(ap[nif], mctx->cpu, num_queues, addr_in, &socket->saddr); + UNUSED(is_external); + } + if (ret < 0) { + errno = EAGAIN; + return -1; + } + socket->opts |= MTCP_ADDR_BIND; + is_dyn_bound = TRUE; + 
} + + cur_stream = + CreateTCPStream(mtcp, socket, socket->socktype, socket->saddr.sin_addr.s_addr, socket->saddr.sin_port, dip, dport); + if (!cur_stream) { + TRACE_ERROR("Socket %d: failed to create tcp_stream!\n", sockid); + errno = ENOMEM; + return -1; + } + + if (is_dyn_bound) + cur_stream->is_bound_addr = TRUE; + cur_stream->sndvar->cwnd = 1; + cur_stream->sndvar->ssthresh = cur_stream->sndvar->mss * 10; + + cur_stream->state = TCP_ST_SYN_SENT; + TRACE_STATE("Stream %d: TCP_ST_SYN_SENT\n", cur_stream->id); + + SQ_LOCK(&mtcp->ctx->connect_lock); + ret = StreamEnqueue(mtcp->connectq, cur_stream); + SQ_UNLOCK(&mtcp->ctx->connect_lock); + mtcp->wakeup_flag = TRUE; + if (ret < 0) { + TRACE_ERROR("Socket %d: failed to enqueue to conenct queue!\n", sockid); + SQ_LOCK(&mtcp->ctx->destroyq_lock); + StreamEnqueue(mtcp->destroyq, cur_stream); + SQ_UNLOCK(&mtcp->ctx->destroyq_lock); + errno = EAGAIN; + return -1; + } + + /* if nonblocking socket, return EINPROGRESS */ + if (socket->opts & MTCP_NONBLOCK) { + errno = EINPROGRESS; + return -1; + + } else { + while (1) { + if (!cur_stream) { + TRACE_ERROR("STREAM DESTROYED\n"); + errno = ETIMEDOUT; + return -1; + } + if (cur_stream->state > TCP_ST_ESTABLISHED) { + TRACE_ERROR("Socket %d: weird state %s\n", sockid, TCPStateToString(cur_stream)); + // TODO: how to handle this? 
+ errno = ENOSYS; + return -1; + } + + if (cur_stream->state == TCP_ST_ESTABLISHED) { + break; + } + usleep(1000); + } + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +static inline int CloseStreamSocket(mctx_t mctx, int sockid) +{ + mtcp_manager_t mtcp; + tcp_stream *cur_stream; + int ret; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + cur_stream = mtcp->smap[sockid].stream; + if (!cur_stream) { + TRACE_API("Socket %d: stream does not exist.\n", sockid); + errno = ENOTCONN; + return -1; + } + + if (cur_stream->closed) { + TRACE_API("Socket %d (Stream %u): already closed stream\n", sockid, cur_stream->id); + return 0; + } + cur_stream->closed = TRUE; + + TRACE_API("Stream %d: closing the stream.\n", cur_stream->id); + + cur_stream->socket = NULL; + + if (cur_stream->state == TCP_ST_CLOSED) { + TRACE_API("Stream %d at TCP_ST_CLOSED. destroying the stream.\n", cur_stream->id); + SQ_LOCK(&mtcp->ctx->destroyq_lock); + StreamEnqueue(mtcp->destroyq, cur_stream); + mtcp->wakeup_flag = TRUE; + SQ_UNLOCK(&mtcp->ctx->destroyq_lock); + return 0; + + } else if (cur_stream->state == TCP_ST_SYN_SENT) { +#if 1 + SQ_LOCK(&mtcp->ctx->destroyq_lock); + StreamEnqueue(mtcp->destroyq, cur_stream); + SQ_UNLOCK(&mtcp->ctx->destroyq_lock); + mtcp->wakeup_flag = TRUE; +#endif + return -1; + + } else if (cur_stream->state != TCP_ST_ESTABLISHED && cur_stream->state != TCP_ST_CLOSE_WAIT) { + TRACE_API("Stream %d at state %s\n", cur_stream->id, TCPStateToString(cur_stream)); + errno = EBADF; + return -1; + } + + SQ_LOCK(&mtcp->ctx->close_lock); + cur_stream->sndvar->on_closeq = TRUE; + ret = StreamEnqueue(mtcp->closeq, cur_stream); + mtcp->wakeup_flag = TRUE; + SQ_UNLOCK(&mtcp->ctx->close_lock); + + if (ret < 0) { + TRACE_ERROR("(NEVER HAPPEN) Failed to enqueue the stream to close.\n"); + errno = EAGAIN; + return -1; + } + + return 0; +} +/*----------------------------------------------------------------------------*/ 
+/* Tear down a listening socket: free the accept queue, wake any blocked
+ * accepter, destroy the sync primitives, and free the listener object.
+ * NOTE(review): the acceptq is destroyed and the listener freed while a
+ * thread blocked in mtcp_accept() may still reference them after waking —
+ * confirm callers never close a listener with a concurrent accept pending. */
+static inline int CloseListeningSocket(mctx_t mctx, int sockid)
+{
+	mtcp_manager_t mtcp;
+	struct tcp_listener *listener;
+
+	mtcp = GetMTCPManager(mctx);
+	if (!mtcp) {
+		return -1;
+	}
+
+	listener = mtcp->smap[sockid].listener;
+	if (!listener) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (listener->acceptq) {
+		DestroyStreamQueue(listener->acceptq);
+		listener->acceptq = NULL;
+	}
+
+	/* wake a blocked mtcp_accept() so it can observe shutdown */
+	pthread_mutex_lock(&listener->accept_lock);
+	pthread_cond_signal(&listener->accept_cond);
+	pthread_mutex_unlock(&listener->accept_lock);
+
+	pthread_cond_destroy(&listener->accept_cond);
+	pthread_mutex_destroy(&listener->accept_lock);
+
+	free(listener);
+	mtcp->smap[sockid].listener = NULL;
+
+	return 0;
+}
+/*----------------------------------------------------------------------------*/
+/* close() analogue: dispatch to the per-type close routine, then release the
+ * socket slot. The slot is freed even when the type-specific close fails,
+ * mirroring kernel close() semantics (the descriptor is gone regardless). */
+int mtcp_close(mctx_t mctx, int sockid)
+{
+	mtcp_manager_t mtcp;
+	int ret;
+
+	mtcp = GetMTCPManager(mctx);
+	if (!mtcp) {
+		return -1;
+	}
+
+	if (sockid < 0 || sockid >= CONFIG.max_concurrency) {
+		TRACE_API("Socket id %d out of range.\n", sockid);
+		errno = EBADF;
+		return -1;
+	}
+
+	if (mtcp->smap[sockid].socktype == MTCP_SOCK_UNUSED) {
+		TRACE_API("Invalid socket id: %d\n", sockid);
+		errno = EBADF;
+		return -1;
+	}
+
+	TRACE_API("Socket %d: mtcp_close called.\n", sockid);
+
+	switch (mtcp->smap[sockid].socktype) {
+	case MTCP_SOCK_STREAM:
+		ret = CloseStreamSocket(mctx, sockid);
+		break;
+
+	case MTCP_SOCK_LISTENER:
+		ret = CloseListeningSocket(mctx, sockid);
+		break;
+
+	case MTCP_SOCK_EPOLL:
+		ret = CloseEpollSocket(mctx, sockid);
+		break;
+
+	case MTCP_SOCK_PIPE:
+		ret = PipeClose(mctx, sockid);
+		break;
+
+	default:
+		errno = EINVAL;
+		ret = -1;
+		break;
+	}
+
+	FreeSocket(mctx, sockid, FALSE);
+
+	return ret;
+}
+/*----------------------------------------------------------------------------*/
+/* NOTE(review): a fully commented-out mtcp_abort() implementation (~100 lines
+ * of dead code) was removed here; recover it from version control if abort
+ * support is ever reinstated. */
+/*----------------------------------------------------------------------------*/
+/* MSG_PEEK helper: copy up to len bytes of contiguous payload to the user
+ * buffer WITHOUT consuming it from the receive ring. Caller must hold the
+ * stream's read_lock and ensure rcvbuf is non-NULL. Returns bytes copied,
+ * or -1/EAGAIN when no merged payload is available. */
+static inline int PeekForUser(mtcp_manager_t mtcp, tcp_stream *cur_stream, char *buf, int len)
+{
+	(void)mtcp;
+	struct tcp_recv_vars *rcvvar = cur_stream->rcvvar;
+	int copylen;
+
+	copylen = MIN(rcvvar->rcvbuf->merged_len, len);
+	if (copylen <= 0) {
+		errno = EAGAIN;
+		return -1;
+	}
+
+	/* Only copy data to user buffer */
+	memcpy(buf, rcvvar->rcvbuf->head, copylen);
+
+	return copylen;
+}
+/*----------------------------------------------------------------------------*/
+/* Consuming read helper: copy up to len bytes to the user buffer, remove them
+ * from the receive ring, grow the advertised window, and schedule a window-
+ * update ACK when the peer was previously throttled. Caller must hold the
+ * stream's read_lock and ensure rcvbuf is non-NULL. Returns bytes copied,
+ * or -1/EAGAIN when no merged payload is available. */
+static inline int CopyToUser(mtcp_manager_t mtcp, tcp_stream *cur_stream, char *buf, int len)
+{
+	struct tcp_recv_vars *rcvvar = cur_stream->rcvvar;
+	uint32_t prev_rcv_wnd;
+	int copylen;
+
+	copylen = MIN(rcvvar->rcvbuf->merged_len, len);
+	if (copylen <= 0) {
+		errno = EAGAIN;
+		return -1;
+	}
+
+	prev_rcv_wnd = rcvvar->rcv_wnd;
+	/* Copy data to user buffer and remove it from receiving buffer */
+	memcpy(buf, rcvvar->rcvbuf->head, copylen);
+	RBRemove(mtcp->rbm_rcv, rcvvar->rcvbuf, copylen, AT_APP);
+	rcvvar->rcv_wnd = rcvvar->rcvbuf->size - rcvvar->rcvbuf->merged_len;
+
+	/* Advertise newly freed receive buffer */
+	if (cur_stream->need_wnd_adv) {
+		if (rcvvar->rcv_wnd > cur_stream->sndvar->eff_mss) {
+			if (!cur_stream->sndvar->on_ackq) {
+				SQ_LOCK(&mtcp->ctx->ackq_lock);
+				cur_stream->sndvar->on_ackq = TRUE;
+				StreamEnqueue(mtcp->ackq, cur_stream); /* this always success */
+				SQ_UNLOCK(&mtcp->ctx->ackq_lock);
+				cur_stream->need_wnd_adv = FALSE;
+				mtcp->wakeup_flag = TRUE;
+			}
+		}
+	}
+
+	UNUSED(prev_rcv_wnd);
+	return copylen;
+}
+/*----------------------------------------------------------------------------*/ +ssize_t mtcp_recv(mctx_t mctx, int sockid, char *buf, size_t len, int flags) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + tcp_stream *cur_stream; + struct tcp_recv_vars *rcvvar; + int event_remaining; + int ret; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (socket->socktype == MTCP_SOCK_PIPE) { + return PipeRead(mctx, sockid, buf, len); + } + + if (socket->socktype != MTCP_SOCK_STREAM) { + TRACE_API("Not an end socket. id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + /* stream should be in ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT */ + cur_stream = socket->stream; + if (!cur_stream || !(cur_stream->state >= TCP_ST_ESTABLISHED && cur_stream->state <= TCP_ST_CLOSE_WAIT)) { + errno = ENOTCONN; + return -1; + } + + rcvvar = cur_stream->rcvvar; + + /* if CLOSE_WAIT, return 0 if there is no payload */ + if (cur_stream->state == TCP_ST_CLOSE_WAIT) { + if (!rcvvar->rcvbuf) + return 0; + + if (rcvvar->rcvbuf->merged_len == 0) + return 0; + } + + /* return EAGAIN if no receive buffer */ + if (socket->opts & MTCP_NONBLOCK) { + if (!rcvvar->rcvbuf || rcvvar->rcvbuf->merged_len == 0) { + errno = EAGAIN; + return -1; + } + } + + SBUF_LOCK(&rcvvar->read_lock); +#if BLOCKING_SUPPORT + if (!(socket->opts & MTCP_NONBLOCK)) { + while (rcvvar->rcvbuf->merged_len == 0) { + if (!cur_stream || cur_stream->state != TCP_ST_ESTABLISHED) { + SBUF_UNLOCK(&rcvvar->read_lock); + errno = EINTR; + return -1; + } + pthread_cond_wait(&rcvvar->read_cond, &rcvvar->read_lock); + } + } +#endif + + switch (flags) { + case 0: + ret = CopyToUser(mtcp, cur_stream, buf, 
len); + break; + case MSG_PEEK: + ret = PeekForUser(mtcp, cur_stream, buf, len); + break; + default: + SBUF_UNLOCK(&rcvvar->read_lock); + ret = -1; + errno = EINVAL; + return ret; + } + + event_remaining = FALSE; + /* if there are remaining payload, generate EPOLLIN */ + /* (may due to insufficient user buffer) */ + if (socket->epoll & MTCP_EPOLLIN) { + if (!(socket->epoll & MTCP_EPOLLET) && rcvvar->rcvbuf->merged_len > 0) { + event_remaining = TRUE; + } + } + /* if waiting for close, notify it if no remaining data */ + if (cur_stream->state == TCP_ST_CLOSE_WAIT && rcvvar->rcvbuf->merged_len == 0 && ret > 0) { + event_remaining = TRUE; + } + + SBUF_UNLOCK(&rcvvar->read_lock); + + if (event_remaining) { + if (socket->epoll) { + AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN); +#if BLOCKING_SUPPORT + } else if (!(socket->opts & MTCP_NONBLOCK)) { + if (!cur_stream->on_rcv_br_list) { + cur_stream->on_rcv_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->rcv_br_list, cur_stream, rcvvar->rcv_br_link); + mtcp->rcv_br_list_cnt++; + } +#endif + } + } + + TRACE_API("Stream %d: mtcp_recv() returning %d\n", cur_stream->id, ret); + return ret; +} +/*----------------------------------------------------------------------------*/ +inline ssize_t mtcp_read(mctx_t mctx, int sockid, char *buf, size_t len) +{ + return mtcp_recv(mctx, sockid, buf, len, 0); +} +/*----------------------------------------------------------------------------*/ +int mtcp_readv(mctx_t mctx, int sockid, const struct iovec *iov, int numIOV) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + tcp_stream *cur_stream; + struct tcp_recv_vars *rcvvar; + int ret, bytes_read, i; + int event_remaining; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->socktype == MTCP_SOCK_UNUSED) { + 
TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (socket->socktype != MTCP_SOCK_STREAM) { + TRACE_API("Not an end socket. id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + /* stream should be in ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT */ + cur_stream = socket->stream; + if (!cur_stream || !(cur_stream->state >= TCP_ST_ESTABLISHED && cur_stream->state <= TCP_ST_CLOSE_WAIT)) { + errno = ENOTCONN; + return -1; + } + + rcvvar = cur_stream->rcvvar; + + /* if CLOSE_WAIT, return 0 if there is no payload */ + if (cur_stream->state == TCP_ST_CLOSE_WAIT) { + if (!rcvvar->rcvbuf) + return 0; + + if (rcvvar->rcvbuf->merged_len == 0) + return 0; + } + + /* return EAGAIN if no receive buffer */ + if (socket->opts & MTCP_NONBLOCK) { + if (!rcvvar->rcvbuf || rcvvar->rcvbuf->merged_len == 0) { + errno = EAGAIN; + return -1; + } + } + + SBUF_LOCK(&rcvvar->read_lock); +#if BLOCKING_SUPPORT + if (!(socket->opts & MTCP_NONBLOCK)) { + while (rcvvar->rcvbuf->merged_len == 0) { + if (!cur_stream || cur_stream->state != TCP_ST_ESTABLISHED) { + SBUF_UNLOCK(&rcvvar->read_lock); + errno = EINTR; + return -1; + } + pthread_cond_wait(&rcvvar->read_cond, &rcvvar->read_lock); + } + } +#endif + + /* read and store the contents to the vectored buffers */ + bytes_read = 0; + for (i = 0; i < numIOV; i++) { + if (iov[i].iov_len <= 0) + continue; + + ret = CopyToUser(mtcp, cur_stream, iov[i].iov_base, iov[i].iov_len); + if (ret <= 0) + break; + + bytes_read += ret; + + if (ret < (int)iov[i].iov_len) + break; + } + + event_remaining = FALSE; + /* if there are remaining payload, generate read event */ + /* (may due to insufficient user buffer) */ + if (socket->epoll & MTCP_EPOLLIN) { + if (!(socket->epoll & MTCP_EPOLLET) && rcvvar->rcvbuf->merged_len > 0) { + event_remaining = TRUE; + } + } + /* if waiting for close, notify it if no remaining data */ + if (cur_stream->state == TCP_ST_CLOSE_WAIT && rcvvar->rcvbuf->merged_len == 0 && bytes_read > 0) 
{ + event_remaining = TRUE; + } + + SBUF_UNLOCK(&rcvvar->read_lock); + + if (event_remaining) { + if ((socket->epoll & MTCP_EPOLLIN) && !(socket->epoll & MTCP_EPOLLET)) { + AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN); +#if BLOCKING_SUPPORT + } else if (!(socket->opts & MTCP_NONBLOCK)) { + if (!cur_stream->on_rcv_br_list) { + cur_stream->on_rcv_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->rcv_br_list, cur_stream, rcvvar->rcv_br_link); + mtcp->rcv_br_list_cnt++; + } +#endif + } + } + + TRACE_API("Stream %d: mtcp_readv() returning %d\n", cur_stream->id, bytes_read); + return bytes_read; +} +/*----------------------------------------------------------------------------*/ +static inline int CopyFromUser(mtcp_manager_t mtcp, tcp_stream *cur_stream, const char *buf, int len) +{ + struct tcp_send_vars *sndvar = cur_stream->sndvar; + int sndlen; + int ret; + + sndlen = MIN((int)sndvar->snd_wnd, len); + if (sndlen <= 0) { + errno = EAGAIN; + return -1; + } + + /* allocate send buffer if not exist */ + if (!sndvar->sndbuf) { + sndvar->sndbuf = SBInit(mtcp->rbm_snd, sndvar->iss + 1); + if (!sndvar->sndbuf) { + cur_stream->close_reason = TCP_NO_MEM; + /* notification may not required due to -1 return */ + errno = ENOMEM; + return -1; + } + } + + ret = SBPut(mtcp->rbm_snd, sndvar->sndbuf, buf, sndlen); + assert(ret == sndlen); + sndvar->snd_wnd = sndvar->sndbuf->size - sndvar->sndbuf->len; + if (ret <= 0) { + TRACE_ERROR("SBPut failed. reason: %d (sndlen: %u, len: %u\n", ret, sndlen, sndvar->sndbuf->len); + errno = EAGAIN; + return -1; + } + + if (sndvar->snd_wnd <= 0) { + TRACE_SNDBUF("%u Sending buffer became full!! 
snd_wnd: %u\n", cur_stream->id, sndvar->snd_wnd); + } + + return ret; +} +/*----------------------------------------------------------------------------*/ +ssize_t mtcp_write(mctx_t mctx, int sockid, const char *buf, size_t len) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + tcp_stream *cur_stream; + struct tcp_send_vars *sndvar; + int ret; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (socket->socktype == MTCP_SOCK_PIPE) { + return PipeWrite(mctx, sockid, buf, len); + } + + if (socket->socktype != MTCP_SOCK_STREAM) { + TRACE_API("Not an end socket. id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + cur_stream = socket->stream; + if (!cur_stream || !(cur_stream->state == TCP_ST_ESTABLISHED || cur_stream->state == TCP_ST_CLOSE_WAIT)) { + errno = ENOTCONN; + return -1; + } + + if (len <= 0) { + if (socket->opts & MTCP_NONBLOCK) { + errno = EAGAIN; + return -1; + } else { + return 0; + } + } + + sndvar = cur_stream->sndvar; + + SBUF_LOCK(&sndvar->write_lock); +#if BLOCKING_SUPPORT + if (!(socket->opts & MTCP_NONBLOCK)) { + while (sndvar->snd_wnd <= 0) { + TRACE_SNDBUF("Waiting for available sending window...\n"); + if (!cur_stream || cur_stream->state != TCP_ST_ESTABLISHED) { + SBUF_UNLOCK(&sndvar->write_lock); + errno = EINTR; + return -1; + } + pthread_cond_wait(&sndvar->write_cond, &sndvar->write_lock); + TRACE_SNDBUF("Sending buffer became ready! 
snd_wnd: %u\n", sndvar->snd_wnd); + } + } +#endif + + ret = CopyFromUser(mtcp, cur_stream, buf, len); + + SBUF_UNLOCK(&sndvar->write_lock); + + if (ret > 0 && !(sndvar->on_sendq || sndvar->on_send_list)) { + SQ_LOCK(&mtcp->ctx->sendq_lock); + sndvar->on_sendq = TRUE; + StreamEnqueue(mtcp->sendq, cur_stream); /* this always success */ + SQ_UNLOCK(&mtcp->ctx->sendq_lock); + mtcp->wakeup_flag = TRUE; + } + + if (ret == 0 && (socket->opts & MTCP_NONBLOCK)) { + ret = -1; + errno = EAGAIN; + } + + /* if there are remaining sending buffer, generate write event */ + if (sndvar->snd_wnd > 0) { + if ((socket->epoll & MTCP_EPOLLOUT) && !(socket->epoll & MTCP_EPOLLET)) { + AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLOUT); +#if BLOCKING_SUPPORT + } else if (!(socket->opts & MTCP_NONBLOCK)) { + if (!cur_stream->on_snd_br_list) { + cur_stream->on_snd_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->snd_br_list, cur_stream, sndvar->snd_br_link); + mtcp->snd_br_list_cnt++; + } +#endif + } + } + + TRACE_API("Stream %d: mtcp_write() returning %d\n", cur_stream->id, ret); + return ret; +} +/*----------------------------------------------------------------------------*/ +int mtcp_writev(mctx_t mctx, int sockid, const struct iovec *iov, int numIOV) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + tcp_stream *cur_stream; + struct tcp_send_vars *sndvar; + int ret, to_write, i; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + socket = &mtcp->smap[sockid]; + if (socket->socktype == MTCP_SOCK_UNUSED) { + TRACE_API("Invalid socket id: %d\n", sockid); + errno = EBADF; + return -1; + } + + if (socket->socktype != MTCP_SOCK_STREAM) { + TRACE_API("Not an end socket. 
id: %d\n", sockid); + errno = ENOTSOCK; + return -1; + } + + cur_stream = socket->stream; + if (!cur_stream || !(cur_stream->state == TCP_ST_ESTABLISHED || cur_stream->state == TCP_ST_CLOSE_WAIT)) { + errno = ENOTCONN; + return -1; + } + + sndvar = cur_stream->sndvar; + SBUF_LOCK(&sndvar->write_lock); +#if BLOCKING_SUPPORT + if (!(socket->opts & MTCP_NONBLOCK)) { + while (sndvar->snd_wnd <= 0) { + TRACE_SNDBUF("Waiting for available sending window...\n"); + if (!cur_stream || cur_stream->state != TCP_ST_ESTABLISHED) { + SBUF_UNLOCK(&sndvar->write_lock); + errno = EINTR; + return -1; + } + pthread_cond_wait(&sndvar->write_cond, &sndvar->write_lock); + TRACE_SNDBUF("Sending buffer became ready! snd_wnd: %u\n", sndvar->snd_wnd); + } + } +#endif + + /* write from the vectored buffers */ + to_write = 0; + for (i = 0; i < numIOV; i++) { + if (iov[i].iov_len <= 0) + continue; + + ret = CopyFromUser(mtcp, cur_stream, iov[i].iov_base, iov[i].iov_len); + if (ret <= 0) + break; + + to_write += ret; + + if (ret < (int)iov[i].iov_len) + break; + } + SBUF_UNLOCK(&sndvar->write_lock); + + if (to_write > 0 && !(sndvar->on_sendq || sndvar->on_send_list)) { + SQ_LOCK(&mtcp->ctx->sendq_lock); + sndvar->on_sendq = TRUE; + StreamEnqueue(mtcp->sendq, cur_stream); /* this always success */ + SQ_UNLOCK(&mtcp->ctx->sendq_lock); + mtcp->wakeup_flag = TRUE; + } + + if (to_write == 0 && (socket->opts & MTCP_NONBLOCK)) { + to_write = -1; + errno = EAGAIN; + } + + /* if there are remaining sending buffer, generate write event */ + if (sndvar->snd_wnd > 0) { + if ((socket->epoll & MTCP_EPOLLOUT) && !(socket->epoll & MTCP_EPOLLET)) { + AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLOUT); +#if BLOCKING_SUPPORT + } else if (!(socket->opts & MTCP_NONBLOCK)) { + if (!cur_stream->on_snd_br_list) { + cur_stream->on_snd_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->snd_br_list, cur_stream, sndvar->snd_br_link); + mtcp->snd_br_list_cnt++; + } +#endif + } + } + + TRACE_API("Stream %d: 
mtcp_writev() returning %d\n", cur_stream->id, to_write); + return to_write; +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/arp.c b/lib/flash/mtcp/arp.c new file mode 100644 index 0000000..c7cc2cb --- /dev/null +++ b/lib/flash/mtcp/arp.c @@ -0,0 +1,386 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +/* for inet_ntoa() */ +#include +#include +#include + +#include "mtcp.h" +#include "arp.h" +#include "eth_out.h" +#include "debug.h" + +#define ARP_PAD_LEN 18 /* arp pad length to fit 64B packet size */ +#define ARP_TIMEOUT_SEC 1 /* 1 second arp timeout */ + +/*----------------------------------------------------------------------------*/ +enum arp_hrd_format { arp_hrd_ethernet = 1 }; +/*----------------------------------------------------------------------------*/ +enum arp_opcode { + arp_op_request = 1, + arp_op_reply = 2, +}; +/*----------------------------------------------------------------------------*/ +struct arphdr { + uint16_t ar_hrd; /* hardware address format */ + uint16_t ar_pro; /* protocol address format */ + uint8_t ar_hln; /* hardware address length */ + uint8_t ar_pln; /* protocol address length */ + uint16_t ar_op; /* arp opcode */ + + uint8_t ar_sha[ETH_ALEN]; /* sender hardware address */ + uint32_t ar_sip; /* sender ip address */ + uint8_t ar_tha[ETH_ALEN]; /* targe hardware address */ + uint32_t ar_tip; /* target ip address */ + + uint8_t pad[ARP_PAD_LEN]; +} __attribute__((packed)); +/*----------------------------------------------------------------------------*/ +struct arp_queue_entry { + uint32_t ip; /* target ip address */ + int nif_out; /* output interface number */ + uint32_t ts_out; /* last sent timestamp */ + + TAILQ_ENTRY(arp_queue_entry) arp_link; +}; 
+/*----------------------------------------------------------------------------*/ +struct arp_manager { + TAILQ_HEAD(, arp_queue_entry) list; + pthread_mutex_t lock; +}; +/*----------------------------------------------------------------------------*/ +struct arp_manager g_arpm; +/*----------------------------------------------------------------------------*/ +void DumpARPPacket(mtcp_manager_t mtcp, struct arphdr *arph); +/*----------------------------------------------------------------------------*/ +int InitARPTable(void) +{ + CONFIG.arp.entries = 0; + + CONFIG.arp.entry = (struct arp_entry *)calloc(MAX_ARPENTRY, sizeof(struct arp_entry)); + if (CONFIG.arp.entry == NULL) { + perror("calloc"); + return -1; + } + + TAILQ_INIT(&g_arpm.list); + pthread_mutex_init(&g_arpm.lock, NULL); + + return 0; +} +/*----------------------------------------------------------------------------*/ +unsigned char *GetHWaddr(uint32_t ip) +{ + int i; + unsigned char *haddr = NULL; + for (i = 0; i < CONFIG.eths_num; i++) { + if (ip == CONFIG.eths[i].ip_addr) { + haddr = CONFIG.eths[i].haddr; + break; + } + } + + return haddr; +} +/*----------------------------------------------------------------------------*/ +unsigned char *GetDestinationHWaddr(uint32_t dip, uint8_t is_gateway) +{ + unsigned char *d_haddr = NULL; + int prefix = 0; + int i; + + if (is_gateway == 1 && CONFIG.arp.gateway) + d_haddr = (CONFIG.arp.gateway)->haddr; + else { + /* Longest prefix matching */ + for (i = 0; i < CONFIG.arp.entries; i++) { + if (CONFIG.arp.entry[i].prefix == 1) { + if (CONFIG.arp.entry[i].ip == dip) { + d_haddr = CONFIG.arp.entry[i].haddr; + break; + } + } else { + if ((dip & CONFIG.arp.entry[i].ip_mask) == CONFIG.arp.entry[i].ip_masked) { + if (CONFIG.arp.entry[i].prefix > prefix) { + d_haddr = CONFIG.arp.entry[i].haddr; + prefix = CONFIG.arp.entry[i].prefix; + } + } + } + } + } + + return d_haddr; +} +/*----------------------------------------------------------------------------*/ +static int 
ARPOutput(struct mtcp_manager *mtcp, int nif, int opcode, uint32_t dst_ip, unsigned char *dst_haddr, + unsigned char *target_haddr) +{ + if (!dst_haddr) + return -1; + + /* Allocate a buffer */ + struct arphdr *arph = (struct arphdr *)EthernetOutput(mtcp, ETH_P_ARP, nif, dst_haddr, sizeof(struct arphdr)); + if (!arph) { + return -1; + } + /* Fill arp header */ + arph->ar_hrd = htons(arp_hrd_ethernet); + arph->ar_pro = htons(ETH_P_IP); + arph->ar_hln = ETH_ALEN; + arph->ar_pln = 4; + arph->ar_op = htons(opcode); + + /* Fill arp body */ + int edix = CONFIG.nif_to_eidx[nif]; + arph->ar_sip = CONFIG.eths[edix].ip_addr; + arph->ar_tip = dst_ip; + + memcpy(arph->ar_sha, CONFIG.eths[edix].haddr, arph->ar_hln); + if (target_haddr) { + memcpy(arph->ar_tha, target_haddr, arph->ar_hln); + } else { + memcpy(arph->ar_tha, dst_haddr, arph->ar_hln); + } + memset(arph->pad, 0, ARP_PAD_LEN); + +#if defined(DBGMSG) + DumpARPPacket(mtcp, arph); +#endif + + return 0; +} +/*----------------------------------------------------------------------------*/ +static int RegisterARPEntry(uint32_t ip, const unsigned char *haddr) +{ + int idx = CONFIG.arp.entries; + + CONFIG.arp.entry[idx].prefix = 32; + CONFIG.arp.entry[idx].ip = ip; + memcpy(CONFIG.arp.entry[idx].haddr, haddr, ETH_ALEN); + CONFIG.arp.entry[idx].ip_mask = -1; + CONFIG.arp.entry[idx].ip_masked = ip; + + if (CONFIG.gateway && ((CONFIG.gateway)->daddr & CONFIG.arp.entry[idx].ip_mask) == CONFIG.arp.entry[idx].ip_masked) { + CONFIG.arp.gateway = &CONFIG.arp.entry[idx]; + TRACE_CONFIG("ARP Gateway SET!\n"); + } + + CONFIG.arp.entries = idx + 1; + + TRACE_CONFIG("Learned new arp entry.\n"); + PrintARPTable(); + + return 0; +} +/*----------------------------------------------------------------------------*/ +void RequestARP(mtcp_manager_t mtcp, uint32_t ip, int nif, uint32_t cur_ts) +{ + struct arp_queue_entry *ent; + unsigned char haddr[ETH_ALEN]; + unsigned char taddr[ETH_ALEN]; + + pthread_mutex_lock(&g_arpm.lock); + /* if the arp 
request is in progress, return */ + TAILQ_FOREACH(ent, &g_arpm.list, arp_link) + { + if (ent->ip == ip) { + pthread_mutex_unlock(&g_arpm.lock); + return; + } + } + + ent = (struct arp_queue_entry *)calloc(1, sizeof(struct arp_queue_entry)); + ent->ip = ip; + ent->nif_out = nif; + ent->ts_out = cur_ts; + TAILQ_INSERT_TAIL(&g_arpm.list, ent, arp_link); + pthread_mutex_unlock(&g_arpm.lock); + + /* else, broadcast arp request */ + memset(haddr, 0xFF, ETH_ALEN); + memset(taddr, 0x00, ETH_ALEN); + ARPOutput(mtcp, nif, arp_op_request, ip, haddr, taddr); +} +/*----------------------------------------------------------------------------*/ +static int ProcessARPRequest(mtcp_manager_t mtcp, struct arphdr *arph, int nif, uint32_t cur_ts) +{ + (void)cur_ts; + unsigned char *temp; + + /* register the arp entry if not exist */ + temp = GetDestinationHWaddr(arph->ar_sip, 0); + if (!temp) { + RegisterARPEntry(arph->ar_sip, arph->ar_sha); + } + + /* send arp reply */ + ARPOutput(mtcp, nif, arp_op_reply, arph->ar_sip, arph->ar_sha, NULL); + + return 0; +} +/*----------------------------------------------------------------------------*/ +static int ProcessARPReply(mtcp_manager_t mtcp, struct arphdr *arph, uint32_t cur_ts) +{ + (void)mtcp; + (void)cur_ts; + unsigned char *temp; + struct arp_queue_entry *ent; + + /* register the arp entry if not exist */ + temp = GetDestinationHWaddr(arph->ar_sip, 0); + if (!temp) { + RegisterARPEntry(arph->ar_sip, arph->ar_sha); + } + + /* remove from the arp request queue */ + pthread_mutex_lock(&g_arpm.lock); + TAILQ_FOREACH(ent, &g_arpm.list, arp_link) + { + if (ent->ip == arph->ar_sip) { + TAILQ_REMOVE(&g_arpm.list, ent, arp_link); + free(ent); + break; + } + } + pthread_mutex_unlock(&g_arpm.lock); + + return 0; +} +/*----------------------------------------------------------------------------*/ +int ProcessARPPacket(mtcp_manager_t mtcp, uint32_t cur_ts, const int ifidx, unsigned char *pkt_data, int len) +{ + (void)len; + struct arphdr *arph = 
(struct arphdr *)(pkt_data + sizeof(struct ethhdr)); + int i, nif; + int to_me = FALSE; + + /* process the arp messages destined to me */ + for (i = 0; i < CONFIG.eths_num; i++) { + if (arph->ar_tip == CONFIG.eths[i].ip_addr) { + to_me = TRUE; + } + } + + if (!to_me) + return TRUE; + +#if defined(DBGMSG) + DumpARPPacket(mtcp, arph); +#endif + + switch (ntohs(arph->ar_op)) { + case arp_op_request: + nif = CONFIG.eths[ifidx].ifindex; // use the port index as argument + ProcessARPRequest(mtcp, arph, nif, cur_ts); + break; + + case arp_op_reply: + ProcessARPReply(mtcp, arph, cur_ts); + break; + + default: + break; + } + + return TRUE; +} +/*----------------------------------------------------------------------------*/ +/* ARPTimer: wakes up every milisecond and check the ARP timeout */ +/* timeout is set to 1 second */ +/*----------------------------------------------------------------------------*/ +void ARPTimer(mtcp_manager_t mtcp, uint32_t cur_ts) +{ + struct arp_queue_entry *ent, *ent_tmp; + + /* if the arp requet is timed out, retransmit */ + pthread_mutex_lock(&g_arpm.lock); + TAILQ_FOREACH_SAFE(ent, &g_arpm.list, arp_link, ent_tmp) + { + if (TCP_SEQ_GT(cur_ts, ent->ts_out + SEC_TO_TS(ARP_TIMEOUT_SEC))) { + struct in_addr ina; + ina.s_addr = ent->ip; + TRACE_INFO("[CPU%2d] ARP request for %s timed out.\n", mtcp->ctx->cpu, inet_ntoa(ina)); + TAILQ_REMOVE(&g_arpm.list, ent, arp_link); + free(ent); + } + } + pthread_mutex_unlock(&g_arpm.lock); +} +/*----------------------------------------------------------------------------*/ +void PrintARPTable(void) +{ + int i; + + /* print out process start information */ + TRACE_CONFIG("ARP Table:\n"); + for (i = 0; i < CONFIG.arp.entries; i++) { + uint8_t *da = (uint8_t *)&CONFIG.arp.entry[i].ip; + + TRACE_CONFIG("IP addr: %u.%u.%u.%u, " + "dst_hwaddr: %02X:%02X:%02X:%02X:%02X:%02X\n", + da[0], da[1], da[2], da[3], CONFIG.arp.entry[i].haddr[0], CONFIG.arp.entry[i].haddr[1], + CONFIG.arp.entry[i].haddr[2], 
CONFIG.arp.entry[i].haddr[3], CONFIG.arp.entry[i].haddr[4], + CONFIG.arp.entry[i].haddr[5]); + } + if (CONFIG.arp.entries == 0) + TRACE_CONFIG("(blank)\n"); + + TRACE_CONFIG("----------------------------------------------------------" + "-----------------------\n"); +} +/*----------------------------------------------------------------------------*/ +void DumpARPPacket(mtcp_manager_t mtcp, struct arphdr *arph) +{ + uint8_t *t; + + thread_printf(mtcp, mtcp->log_fp, "ARP header: \n"); + thread_printf(mtcp, mtcp->log_fp, + "Hardware type: %d (len: %d), " + "protocol type: %d (len: %d), opcode: %d\n", + ntohs(arph->ar_hrd), arph->ar_hln, ntohs(arph->ar_pro), arph->ar_pln, ntohs(arph->ar_op)); + t = (uint8_t *)&arph->ar_sip; + thread_printf(mtcp, mtcp->log_fp, + "Sender IP: %u.%u.%u.%u, " + "haddr: %02X:%02X:%02X:%02X:%02X:%02X\n", + t[0], t[1], t[2], t[3], arph->ar_sha[0], arph->ar_sha[1], arph->ar_sha[2], arph->ar_sha[3], arph->ar_sha[4], + arph->ar_sha[5]); + t = (uint8_t *)&arph->ar_tip; + thread_printf(mtcp, mtcp->log_fp, + "Target IP: %u.%u.%u.%u, " + "haddr: %02X:%02X:%02X:%02X:%02X:%02X\n", + t[0], t[1], t[2], t[3], arph->ar_tha[0], arph->ar_tha[1], arph->ar_tha[2], arph->ar_tha[3], arph->ar_tha[4], + arph->ar_tha[5]); +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/ccp.c b/lib/flash/mtcp/ccp.c new file mode 100644 index 0000000..0c4bb8d --- /dev/null +++ b/lib/flash/mtcp/ccp.c @@ -0,0 +1,326 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include "mtcp.h" +#include "tcp_in.h" +#include "tcp_stream.h" +#include "debug.h" +#include "clock.h" +#if USE_CCP +#include "ccp.h" +#include "libccp/ccp.h" +/*----------------------------------------------------------------------------*/ +static inline void get_stream_from_ccp(tcp_stream **stream, struct ccp_connection *conn) +{ + *stream = (tcp_stream *)ccp_get_impl(conn); +} +/*----------------------------------------------------------------------------*/ +static inline void get_mtcp_from_ccp(mtcp_manager_t *mtcp) +{ + *mtcp = (mtcp_manager_t)ccp_get_global_impl(); +} +/*----------------------------------------------------------------------------*/ +/* Function handlers passed to libccp */ +/*----------------------------------------------------------------------------*/ +static void _dp_set_cwnd(struct ccp_datapath *dp, struct ccp_connection *conn, uint32_t cwnd) +{ + tcp_stream *stream; + get_stream_from_ccp(&stream, conn); + uint32_t new_cwnd = MAX(cwnd, TCP_INIT_CWND * stream->sndvar->mss); + + // (time_ms) (rtt) (curr_cwnd_pkts) (new_cwnd_pkts) (ssthresh) + if (cwnd != stream->sndvar->cwnd) { + CCP_PROBE("%lu %d %d->%d (ss=%d)\n", USECS_TO_MS(now_usecs()), + UNSHIFT_SRTT(stream->rcvvar->srtt) stream->sndvar->cwnd / stream->sndvar->mss, + new_cwnd / stream->sndvar->mss, stream->sndvar->ssthresh / stream->sndvar->mss); + } + stream->sndvar->cwnd = new_cwnd; +} +/*----------------------------------------------------------------------------*/ +static void _dp_set_rate_abs(struct ccp_datapath *dp, struct ccp_connection *conn, uint32_t rate) +{ + tcp_stream *stream; + get_stream_from_ccp(&stream, conn); +#if PACING_ENABLED || RATE_LIMIT_ENABLED +#if RATE_LIMIT_ENABLED + stream->bucket->rate = rate; +#endif +#if PACING_ENABLED + stream->pacer->rate_bps = rate; +#endif +#else + TRACE_ERROR("unable to set rate, both PACING and RATE_LIMIT are disabled." 
+ " Enable one to use rates.\n"); +#endif +} +/*----------------------------------------------------------------------------*/ +static void _dp_set_rate_rel(struct ccp_datapath *dp, struct ccp_connection *conn, uint32_t factor) +{ + tcp_stream *stream; + get_stream_from_ccp(&stream, conn); +#if PACING_ENABLED || RATE_LIMIT_ENABLED +#if RATE_LIMIT_ENABLED + stream->bucket->rate *= (factor / 100); +#endif +#if PACING_ENABLED + stream->pacer->rate_bps *= (factor / 100); +#endif +#else + TRACE_ERROR("unable to set rate, both PACING and RATE_LIMIT are disabled." + " Enable one to use rates.\n"); +#endif +} +/*----------------------------------------------------------------------------*/ +int _dp_send_msg(struct ccp_datapath *dp, struct ccp_connection *conn, char *msg, int msg_size) +{ + mtcp_manager_t mtcp; + get_mtcp_from_ccp(&mtcp); + + int ret = send(mtcp->to_ccp, msg, msg_size, 0); + if (ret < 0) { + TRACE_ERROR("failed to send msg to ccp: %s\n", strerror(errno)); + } + return ret; +} +/*----------------------------------------------------------------------------*/ + +/* Connect to CCP process via unix sockets */ +/*----------------------------------------------------------------------------*/ +void setup_ccp_connection(mtcp_manager_t mtcp) +{ + mtcp_thread_context_t ctx = mtcp->ctx; + // TODO do we need a socket per core? 
+ int cpu = ctx->cpu; + //char cpu_str[2] = ""; + int recv_sock; + int path_len; + int ret; + struct sockaddr_un local; + + // Make sure unix socket path exists + ret = mkdir(CCP_UNIX_BASE, 0755); + if (ret < 0 && errno != EEXIST) { + TRACE_ERROR("Failed to create path for ccp unix socket (%d): %s\n", ret, strerror(errno)); + } + ret = mkdir(CCP_UNIX_BASE CCP_ID, 0755); + if (ret < 0 && errno != EEXIST) { + TRACE_ERROR("Failed to create path for ccp unix socket (%d): %s\n", ret, strerror(errno)); + } + if ((recv_sock = socket(AF_UNIX, SOCK_DGRAM, 0)) == -1) { + TRACE_ERROR("Failed to create unix recv socket for ccp comm\n"); + exit(EXIT_FAILURE); + } + local.sun_family = AF_UNIX; + strcpy(local.sun_path, FROM_CCP_PATH); + unlink(local.sun_path); + path_len = strlen(local.sun_path) + sizeof(local.sun_family); + if (bind(recv_sock, (struct sockaddr *)&local, path_len) == -1) { + TRACE_ERROR("(Cpu %d) failed to bind to unix://%s because %s\n", cpu, FROM_CCP_PATH, strerror(errno)); + exit(EXIT_FAILURE); + } + mtcp->from_ccp = recv_sock; + + struct ccp_datapath dp = { .set_cwnd = &_dp_set_cwnd, + .set_rate_abs = &_dp_set_rate_abs, + .set_rate_rel = &_dp_set_rate_rel, + .send_msg = &_dp_send_msg, + .now = &now_usecs, + .since_usecs = &time_since_usecs, + .after_usecs = &time_after_usecs, + .impl = mtcp }; + + if (ccp_init(&dp) < 0) { + TRACE_ERROR("Failed to initialize ccp connection map\n"); + exit(EXIT_FAILURE); + } +} +/*----------------------------------------------------------------------------*/ +void setup_ccp_send_socket(mtcp_manager_t mtcp) +{ + int send_sock; + int path_len; + struct sockaddr_un remote; + if ((send_sock = socket(AF_UNIX, SOCK_DGRAM, 0)) == -1) { + TRACE_ERROR("failed to create unix send socket for ccp comm\n"); + exit(EXIT_FAILURE); + } + remote.sun_family = AF_UNIX; + strcpy(remote.sun_path, TO_CCP_PATH); //TODO:CCP + path_len = strlen(remote.sun_path) + sizeof(remote.sun_family); + if (connect(send_sock, (struct sockaddr *)&remote, path_len) 
== -1) { + TRACE_ERROR("failed to connect to unix://%s because %s\n", TO_CCP_PATH, strerror(errno)); + exit(EXIT_FAILURE); + } + mtcp->to_ccp = send_sock; +} +/*----------------------------------------------------------------------------*/ +void destroy_ccp_connection(mtcp_manager_t mtcp) +{ + ccp_free(); + close(mtcp->from_ccp); + close(mtcp->to_ccp); +} +/*----------------------------------------------------------------------------*/ + +/* Should be called when a new connection is created */ +/*----------------------------------------------------------------------------*/ +void ccp_create(mtcp_manager_t mtcp, tcp_stream *stream) +{ + struct ccp_datapath_info info = { .init_cwnd = TCP_INIT_CWND, // TODO maybe multiply by mss? + .mss = stream->sndvar->mss, + .src_ip = stream->saddr, + .src_port = stream->sport, + .dst_ip = stream->daddr, + .dst_port = stream->dport, + .congAlg = "reno" }; + + stream->ccp_conn = ccp_connection_start((void *)stream, &info); + if (stream->ccp_conn == NULL) { + TRACE_ERROR("failed to initialize ccp_connection") + } else { + TRACE_CCP("ccp.create(%d)\n", dp->index); + } +} + +/* Should be called on each ACK */ +/*----------------------------------------------------------------------------*/ +uint32_t last_drop_t = 0; +void ccp_cong_control(mtcp_manager_t mtcp, tcp_stream *stream, uint32_t ack, uint64_t bytes_delivered, uint64_t packets_delivered) +{ + uint64_t rin = bytes_delivered, //* S_TO_US, // TODO:CCP divide by snd_int_us + rout = bytes_delivered; // * S_TO_US; // TODO:CCP divide by rcv_int_us + struct ccp_connection *conn = stream->ccp_conn; + struct ccp_primitives *mmt = &conn->prims; + + //log_cwnd_rtt(stream); + + mmt->bytes_acked = bytes_delivered; + mmt->packets_acked = packets_delivered; + mmt->snd_cwnd = stream->sndvar->cwnd; + mmt->rtt_sample_us = UNSHIFT_SRTT(stream->rcvvar->srtt); + mmt->bytes_in_flight = 0; // TODO + mmt->packets_in_flight = 0; // TODO + mmt->rate_outgoing = rin; + mmt->rate_incoming = rout; +#if 
TCP_OPT_SACK_ENABLED + mmt->bytes_misordered = stream->rcvvar->sacked_pkts * MSS; + mmt->packets_misordered = stream->rcvvar->sacked_pkts; +#endif + + /* + if (last_drop_t == 0 || _dp_since_usecs(last_drop_t) > 25000) { + mmt->lost_pkts_sample = 0; + last_drop_t = now_usecs(); + } + */ + + //fprintf(stderr, "mmt: %u %u\n", conn->prims.packets_misordered, conn->prims.lost_pkts_sample); + + if (conn != NULL) { + //fprintf(stderr, " lost_pkts=%u\n", mmt->lost_pkts_sample); + ccp_invoke(conn); + conn->prims.was_timeout = false; + conn->prims.bytes_misordered = 0; + conn->prims.packets_misordered = 0; + conn->prims.lost_pkts_sample = 0; +#if TCP_OPT_SACK_ENABLED + stream->rcvvar->sacked_pkts = 0; +#endif + } else { + TRACE_ERROR("ccp_connection not initialized\n"); + } +} + +#if TCP_OPT_SACK_ENABLED +uint32_t window_edge_at_last_loss = 0; +uint32_t last_loss = 0; +#endif +uint32_t last_tri_dupack_seq = 0; + +/* Should be called for any other connection event other than ACK */ +/*----------------------------------------------------------------------------*/ +void ccp_record_event(mtcp_manager_t mtcp, tcp_stream *stream, uint8_t event_type, uint32_t val) +{ +#ifdef DBGCCP + unsigned long now = (unsigned long)(now_usecs()); +#endif + int i; + + switch (event_type) { + case EVENT_DUPACK: +#if TCP_OPT_SACK_ENABLED +#else + // use num dupacks as a proxy for sacked + stream->ccp_conn->prims.bytes_misordered += val; + stream->ccp_conn->prims.packets_misordered++; +#endif + break; + case EVENT_TRI_DUPACK: +#if TCP_OPT_SACK_ENABLED + if (val > window_edge_at_last_loss) { + TRACE_CCP("%lu tridup ack=%u\n", now / 1000, val - stream->sndvar->iss); + for (i = 0; i < MAX_SACK_ENTRY; i++) { + window_edge_at_last_loss = MAX(window_edge_at_last_loss, stream->rcvvar->sack_table[i].right_edge); + } + last_tri_dupack_seq = val; + last_loss = now_usecs(); + stream->ccp_conn->prims.lost_pkts_sample++; + } +#else + // only count as a loss if we haven't already seen 3 dupacks for + // this seq 
number + if (last_tri_dupack_seq != val) { + TRACE_CCP("%lu tridup ack=%d\n", now / 1000, + val // - stream->sndvar->iss + ); + stream->ccp_conn->prims.lost_pkts_sample++; + last_tri_dupack_seq = val; + } +#endif + break; + case EVENT_TIMEOUT: + //stream->ccp_conn->prims.was_timeout = true; + break; + case EVENT_ECN: + TRACE_ERROR("ecn is not currently supported!\n"); + break; + default: + TRACE_ERROR("unknown record event type %d!\n", event_type); + break; + } +} +/*----------------------------------------------------------------------------*/ +#endif diff --git a/lib/flash/mtcp/clock.c b/lib/flash/mtcp/clock.c new file mode 100644 index 0000000..666c271 --- /dev/null +++ b/lib/flash/mtcp/clock.c @@ -0,0 +1,81 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "clock.h" +/*----------------------------------------------------------------------------*/ +uint64_t init_time_ns = 0; +uint32_t last_print = 0; +/*----------------------------------------------------------------------------*/ +uint64_t now_usecs() +{ + struct timespec now; + uint64_t now_ns, now_us; + + clock_gettime(CLOCK_MONOTONIC, &now); + + now_ns = (1000000000L * now.tv_sec) + now.tv_nsec; + if (init_time_ns == 0) { + init_time_ns = now_ns; + } + + now_us = ((now_ns - init_time_ns) / 1000) & 0xffffffff; + return now_us; +} +/*----------------------------------------------------------------------------*/ +uint64_t time_since_usecs(uint64_t then) +{ + return now_usecs() - then; +} +/*----------------------------------------------------------------------------*/ +uint64_t time_after_usecs(uint64_t usecs) +{ + return now_usecs() + usecs; +} +/*----------------------------------------------------------------------------*/ +#define SAMPLE_FREQ_US 10000 + +void log_cwnd_rtt(void *vs) +{ + tcp_stream *stream = (tcp_stream *)vs; + unsigned long now = (unsigned long)(now_usecs()); + if (time_since_usecs(last_print) > SAMPLE_FREQ_US) { + fprintf(stderr, "%lu %d %d/%d\n", now / 1000, stream->rcvvar->srtt * 125, stream->sndvar->cwnd / stream->sndvar->mss, + stream->sndvar->peer_wnd / stream->sndvar->mss); +#if RATE_LIMIT_ENABLED + PrintBucket(stream->bucket); +#endif +#if PACING_ENABLED + PrintPacer(stream->pacer); +#endif + last_print = now; + } +} 
+/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/config.c b/lib/flash/mtcp/config.c new file mode 100644 index 0000000..54693dc --- /dev/null +++ b/lib/flash/mtcp/config.c @@ -0,0 +1,770 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mtcp.h" +#include "config.h" +#include "tcp_in.h" +#include "arp.h" +#include "debug.h" +/* for setting up io modules */ +#include "io_module.h" +/* for if_nametoindex */ +#include + +#define MAX_ROUTE_ENTRY 64 +#define MAX_OPTLINE_LEN 1024 +#define ALL_STRING "all" + +static const char *route_file = "config/route.conf"; +static const char *arp_file = "config/arp.conf"; +struct mtcp_manager *g_mtcp[MAX_CPUS] = { NULL }; +struct mtcp_config CONFIG = { + /* set default configuration */ + .max_concurrency = 10000, + .max_num_buffers = 10000, + .rcvbuf_size = -1, + .sndbuf_size = -1, + .tcp_timeout = TCP_TIMEOUT, + .tcp_timewait = TCP_TIMEWAIT, + .num_mem_ch = 0, +#if USE_CCP + .cc = "reno\n", +#endif +#ifdef ENABLE_ONVM + .onvm_inst = (uint16_t)-1, + .onvm_dest = (uint16_t)-1, + .onvm_serv = (uint16_t)-1 +#endif +}; +addr_pool_t ap[ETH_NUM] = { NULL }; +static char port_list[MAX_OPTLINE_LEN] = ""; +static char port_stat_list[MAX_OPTLINE_LEN] = ""; +/* total cpus detected in the mTCP stack*/ +int num_cpus; +/* this should be equal to num_cpus */ +int num_queues; +int num_devices; + +int num_devices_attached; +int devices_attached[MAX_DEVICES]; +/*----------------------------------------------------------------------------*/ +static inline int mystrtol(const char *nptr, int base) +{ + (void)base; + int rval; + char *endptr; + + errno = 0; + rval = strtol(nptr, &endptr, 10); + /* check for strtol errors */ + if ((errno == ERANGE && (rval == INT_MAX || rval == INT_MIN)) || (errno != 0 && rval == 0)) { + perror("strtol"); + exit(EXIT_FAILURE); + } + if (endptr == nptr) { + TRACE_CONFIG("Parsing strtol error!\n"); + exit(EXIT_FAILURE); + } + + return rval; +} +/*----------------------------------------------------------------------------*/ +static int GetIntValue(char *value) +{ + int ret = 0; + ret = strtol(value, (char **)NULL, 10); + 
if (errno == EINVAL || errno == ERANGE) + return -1; + return ret; +} +/*----------------------------------------------------------------------------*/ +inline uint32_t MaskFromPrefix(int prefix) +{ + uint32_t mask = 0; + uint8_t *mask_t = (uint8_t *)&mask; + int i, j; + + for (i = 0; i <= prefix / 8 && i < 4; i++) { + for (j = 0; j < (prefix - i * 8) && j < 8; j++) { + mask_t[i] |= (1 << (7 - j)); + } + } + + return mask; +} +/*----------------------------------------------------------------------------*/ +static void EnrollRouteTableEntry(char *optstr) +{ + char *daddr_s; + char *prefix; +#ifdef DISABLE_AFXDP + char *dev; + int i; +#endif + int ifidx; + int ridx; + char *saveptr; + + saveptr = NULL; + daddr_s = strtok_r(optstr, "/", &saveptr); + prefix = strtok_r(NULL, " ", &saveptr); +#ifdef DISABLE_AFXDP + dev = strtok_r(NULL, "\n", &saveptr); +#endif + assert(daddr_s != NULL); + assert(prefix != NULL); +#ifdef DISABLE_AFXDP + assert(dev != NULL); +#endif + + ifidx = -1; + if (current_iomodule_func == &ps_module_func) { +#ifndef DISABLE_PSIO + for (i = 0; i < num_devices; i++) { + if (strcmp(dev, devices[i].name) != 0) + continue; + + ifidx = devices[i].ifindex; + break; + } + if (ifidx == -1) { + TRACE_CONFIG("Interface %s does not exist!\n", dev); + exit(4); + } +#endif + } else if (current_iomodule_func == &dpdk_module_func || current_iomodule_func == &onvm_module_func) { +#ifndef DISABLE_DPDK + for (i = 0; i < num_devices; i++) { + if (strcmp(CONFIG.eths[i].dev_name, dev)) + continue; + ifidx = CONFIG.eths[i].ifindex; + break; + } +#endif + } + + ridx = CONFIG.routes++; + if (ridx == MAX_ROUTE_ENTRY) { + TRACE_CONFIG("Maximum routing entry limit (%d) has been reached." 
+ "Consider increasing MAX_ROUTE_ENTRY.\n", + MAX_ROUTE_ENTRY); + exit(4); + } + + CONFIG.rtable[ridx].daddr = inet_addr(daddr_s); + CONFIG.rtable[ridx].prefix = mystrtol(prefix, 10); + if (CONFIG.rtable[ridx].prefix > 32 || CONFIG.rtable[ridx].prefix < 0) { + TRACE_CONFIG("Prefix length should be between 0 - 32.\n"); + exit(4); + } + + CONFIG.rtable[ridx].mask = MaskFromPrefix(CONFIG.rtable[ridx].prefix); + CONFIG.rtable[ridx].masked = CONFIG.rtable[ridx].daddr & CONFIG.rtable[ridx].mask; + CONFIG.rtable[ridx].nif = ifidx; + + if (CONFIG.rtable[ridx].mask == 0) { + TRACE_CONFIG("Default Route GW set!\n"); + CONFIG.gateway = &CONFIG.rtable[ridx]; + } +} +/*----------------------------------------------------------------------------*/ +static int SetRoutingTableFromFile(void) +{ +#define ROUTES "ROUTES" + + FILE *fc; + char optstr[MAX_OPTLINE_LEN]; + int i; + + TRACE_CONFIG("Loading routing configurations from : %s\n", route_file); + + fc = fopen(route_file, "r"); + if (fc == NULL) { + perror("fopen"); + TRACE_CONFIG("Skip loading static routing table\n"); + return -1; + } + + while (1) { + char *iscomment; + int num; + + if (fgets(optstr, MAX_OPTLINE_LEN, fc) == NULL) + break; + + //skip comment + iscomment = strchr(optstr, '#'); + if (iscomment == optstr) + continue; + if (iscomment != NULL) + *iscomment = 0; + + if (!strncmp(optstr, ROUTES, sizeof(ROUTES) - 1)) { + num = GetIntValue(optstr + sizeof(ROUTES)); + if (num <= 0) + break; + + for (i = 0; i < num; i++) { + if (fgets(optstr, MAX_OPTLINE_LEN, fc) == NULL) + break; + + if (*optstr == '#') { + i -= 1; + continue; + } + if (!CONFIG.gateway) + EnrollRouteTableEntry(optstr); + else { + TRACE_ERROR("Default gateway settings in %s should " + "always come as last entry!\n", + route_file); + exit(EXIT_FAILURE); + } + } + } + } + + fclose(fc); + return 0; +} +/*----------------------------------------------------------------------------*/ +void PrintRoutingTable(void) +{ + int i; + uint8_t *da; + uint8_t *m; + 
uint8_t *md; + + /* print out process start information */ + TRACE_CONFIG("Routes:\n"); + for (i = 0; i < CONFIG.routes; i++) { + da = (uint8_t *)&CONFIG.rtable[i].daddr; + m = (uint8_t *)&CONFIG.rtable[i].mask; + md = (uint8_t *)&CONFIG.rtable[i].masked; + TRACE_CONFIG("Destination: %u.%u.%u.%u/%d, Mask: %u.%u.%u.%u, " + "Masked: %u.%u.%u.%u, Route: ifdx-%d\n", + da[0], da[1], da[2], da[3], CONFIG.rtable[i].prefix, m[0], m[1], m[2], m[3], md[0], md[1], md[2], md[3], + CONFIG.rtable[i].nif); + } + if (CONFIG.routes == 0) + TRACE_CONFIG("(blank)\n"); + + TRACE_CONFIG("----------------------------------------------------------" + "-----------------------\n"); +} +/*----------------------------------------------------------------------------*/ +void ParseMACAddress(unsigned char *haddr, char *haddr_str) +{ + int i; + char *str; + unsigned int temp; + char *saveptr = NULL; + + saveptr = NULL; + str = strtok_r(haddr_str, ":", &saveptr); + i = 0; + while (str != NULL) { + if (i >= ETH_ALEN) { + TRACE_CONFIG("MAC address length exceeds %d!\n", ETH_ALEN); + exit(4); + } + if (sscanf(str, "%x", &temp) < 1) { + TRACE_CONFIG("sscanf failed!\n"); + exit(4); + } + haddr[i++] = temp; + str = strtok_r(NULL, ":", &saveptr); + } + if (i < ETH_ALEN) { + TRACE_CONFIG("MAC address length is less than %d!\n", ETH_ALEN); + exit(4); + } +} +/*----------------------------------------------------------------------------*/ +int ParseIPAddress(uint32_t *ip_addr, char *ip_str) +{ + if (ip_str == NULL) { + *ip_addr = 0; + return -1; + } + + *ip_addr = inet_addr(ip_str); + if (*ip_addr == INADDR_NONE) { + TRACE_CONFIG("IP address is not valid %s\n", ip_str); + *ip_addr = 0; + return -1; + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +int SetRoutingTable(void) +{ + int i, ridx; + unsigned int c; + + CONFIG.routes = 0; + CONFIG.rtable = (struct route_table *)calloc(MAX_ROUTE_ENTRY, sizeof(struct route_table)); + if (!CONFIG.rtable) + 
exit(EXIT_FAILURE); + + /* set default routing table */ + for (i = 0; i < CONFIG.eths_num; i++) { + ridx = CONFIG.routes++; + CONFIG.rtable[ridx].daddr = CONFIG.eths[i].ip_addr & CONFIG.eths[i].netmask; + + CONFIG.rtable[ridx].prefix = 0; + c = CONFIG.eths[i].netmask; + while ((c = (c >> 1))) { + CONFIG.rtable[ridx].prefix++; + } + CONFIG.rtable[ridx].prefix++; + + CONFIG.rtable[ridx].mask = CONFIG.eths[i].netmask; + CONFIG.rtable[ridx].masked = CONFIG.rtable[ridx].daddr; + CONFIG.rtable[ridx].nif = CONFIG.eths[ridx].ifindex; + } + + /* set additional routing table */ + SetRoutingTableFromFile(); + + return 0; +} +/*----------------------------------------------------------------------------*/ +void PrintInterfaceInfo(void) +{ + int i; + + /* print out process start information */ + TRACE_CONFIG("Interfaces:\n"); + for (i = 0; i < CONFIG.eths_num; i++) { + uint8_t *da = (uint8_t *)&CONFIG.eths[i].ip_addr; + uint8_t *nm = (uint8_t *)&CONFIG.eths[i].netmask; + + TRACE_CONFIG("name: %s, ifindex: %d, " + "hwaddr: %02X:%02X:%02X:%02X:%02X:%02X, " + "ipaddr: %u.%u.%u.%u, " + "netmask: %u.%u.%u.%u\n", + CONFIG.eths[i].dev_name, CONFIG.eths[i].ifindex, CONFIG.eths[i].haddr[0], CONFIG.eths[i].haddr[1], + CONFIG.eths[i].haddr[2], CONFIG.eths[i].haddr[3], CONFIG.eths[i].haddr[4], CONFIG.eths[i].haddr[5], da[0], + da[1], da[2], da[3], nm[0], nm[1], nm[2], nm[3]); + } + TRACE_CONFIG("Number of NIC queues: %d\n", num_queues); + TRACE_CONFIG("----------------------------------------------------------" + "-----------------------\n"); +} +/*----------------------------------------------------------------------------*/ +static void EnrollARPTableEntry(char *optstr) +{ + char *dip_s; /* destination IP string */ + char *prefix_s; /* IP prefix string */ + char *daddr_s; /* destination MAC string */ + + int prefix; + uint32_t dip_mask; + int idx; + + char *saveptr; + + saveptr = NULL; + dip_s = strtok_r(optstr, "/", &saveptr); + prefix_s = strtok_r(NULL, " ", &saveptr); + daddr_s = 
strtok_r(NULL, "\n", &saveptr); + + assert(dip_s != NULL); + assert(prefix_s != NULL); + assert(daddr_s != NULL); + + if (prefix_s == NULL) + prefix = 32; + else + prefix = mystrtol(prefix_s, 10); + + if (prefix > 32 || prefix < 0) { + TRACE_CONFIG("Prefix length should be between 0 - 32.\n"); + return; + } + + idx = CONFIG.arp.entries++; + + CONFIG.arp.entry[idx].prefix = prefix; + ParseIPAddress(&CONFIG.arp.entry[idx].ip, dip_s); + ParseMACAddress(CONFIG.arp.entry[idx].haddr, daddr_s); + + dip_mask = MaskFromPrefix(prefix); + CONFIG.arp.entry[idx].ip_mask = dip_mask; + CONFIG.arp.entry[idx].ip_masked = CONFIG.arp.entry[idx].ip & dip_mask; + if (CONFIG.gateway && ((CONFIG.gateway)->daddr & CONFIG.arp.entry[idx].ip_mask) == CONFIG.arp.entry[idx].ip_masked) { + CONFIG.arp.gateway = &CONFIG.arp.entry[idx]; + TRACE_CONFIG("ARP Gateway SET!\n"); + } + + /* + int i, cnt; + cnt = 1; + cnt = cnt << (32 - prefix); + + for (i = 0; i < cnt; i++) { + idx = CONFIG.arp.entries++; + CONFIG.arp.entry[idx].ip = htonl(ntohl(ip) + i); + memcpy(CONFIG.arp.entry[idx].haddr, haddr, ETH_ALEN); + } +*/ +} +/*----------------------------------------------------------------------------*/ +int LoadARPTable(void) +{ +#define ARP_ENTRY "ARP_ENTRY" + + FILE *fc; + char optstr[MAX_OPTLINE_LEN]; + int numEntry = 0; + int hasNumEntry = 0; + + TRACE_CONFIG("Loading ARP table from : %s\n", arp_file); + + InitARPTable(); + + fc = fopen(arp_file, "r"); + if (fc == NULL) { + perror("fopen"); + TRACE_CONFIG("Skip loading static ARP table\n"); + return -1; + } + + while (1) { + char *p; + char *temp; + + if (fgets(optstr, MAX_OPTLINE_LEN, fc) == NULL) + break; + + p = optstr; + + // skip comment + if ((temp = strchr(p, '#')) != NULL) + *temp = 0; + // remove front and tailing spaces + while (*p && isspace((int)*p)) + p++; + temp = p + strlen(p) - 1; + while (temp >= p && isspace((int)*temp)) + *temp = 0; + if (*p == 0) /* nothing more to process? 
*/ + continue; + + if (!hasNumEntry && strncmp(p, ARP_ENTRY, sizeof(ARP_ENTRY) - 1) == 0) { + numEntry = GetIntValue(p + sizeof(ARP_ENTRY)); + if (numEntry <= 0) { + fprintf(stderr, "Wrong entry in arp.conf: %s\n", p); + exit(EXIT_FAILURE); + } +#if 0 + CONFIG.arp.entry = (struct arp_entry *) + calloc(numEntry + MAX_ARPENTRY, sizeof(struct arp_entry)); + if (CONFIG.arp.entry == NULL) { + fprintf(stderr, "Wrong entry in arp.conf: %s\n", p); + exit(EXIT_FAILURE); + } +#endif + hasNumEntry = 1; + } else { + if (numEntry <= 0) { + fprintf(stderr, + "Error in arp.conf: more entries than " + "are specifed, entry=%s\n", + p); + exit(EXIT_FAILURE); + } + EnrollARPTableEntry(p); + numEntry--; + } + } + + fclose(fc); + return 0; +} +/*----------------------------------------------------------------------------*/ +static int SetMultiProcessSupport(char *multiprocess_details) +{ + const char *token = " ="; + char *sample; + char *saveptr; + + saveptr = NULL; + sample = strtok_r(multiprocess_details, token, &saveptr); + if (sample == NULL) { + TRACE_CONFIG("No option for multi-process support given!\n"); + return -1; + } + CONFIG.multi_process = mystrtol(sample, 10); + TRACE_CONFIG("Loading multi-process configuration: %d\n", CONFIG.multi_process); + return 0; +} +/*----------------------------------------------------------------------------*/ +static inline void SaveInterfaceInfo(char *dev_name_list) +{ + strcpy(port_list, dev_name_list); +} +/*----------------------------------------------------------------------------*/ +static inline void SaveInterfaceStatList(char *dev_name_list) +{ + strcpy(port_stat_list, dev_name_list); +} +/*----------------------------------------------------------------------------*/ +static int ParseConfiguration(char *line) +{ + char optstr[MAX_OPTLINE_LEN]; + char *p, *q; + + char *saveptr; + + strncpy(optstr, line, MAX_OPTLINE_LEN - 1); + optstr[MAX_OPTLINE_LEN - 1] = '\0'; + saveptr = NULL; + + p = strtok_r(optstr, " \t=", &saveptr); + if (p == 
NULL) { + TRACE_CONFIG("No option name found for the line: %s\n", line); + return -1; + } + + q = strtok_r(NULL, " \t=", &saveptr); + if (q == NULL) { + TRACE_CONFIG("No option value found for the line: %s\n", line); + return -1; + } + + if (strcmp(p, "num_cores") == 0) { + CONFIG.num_cores = mystrtol(q, 10); + if (CONFIG.num_cores <= 0) { + TRACE_CONFIG("Number of cores should be larger than 0.\n"); + return -1; + } + if (CONFIG.num_cores > num_cpus) { + TRACE_CONFIG("Number of cores should be smaller than " + "# physical CPU cores.\n"); + return -1; + } + num_cpus = CONFIG.num_cores; + } else if (strcmp(p, "core_mask") == 0) { +#ifndef DISABLE_DPDK + mpz_set_str(CONFIG._cpumask, q, 16); +#endif + } else if (strcmp(p, "max_concurrency") == 0) { + CONFIG.max_concurrency = mystrtol(q, 10); + if (CONFIG.max_concurrency < 0) { + TRACE_CONFIG("The maximum concurrency should be larger than 0.\n"); + return -1; + } + } else if (strcmp(p, "max_num_buffers") == 0) { + CONFIG.max_num_buffers = mystrtol(q, 10); + if (CONFIG.max_num_buffers < 0) { + TRACE_CONFIG("The maximum # buffers should be larger than 0.\n"); + return -1; + } + } else if (strcmp(p, "rcvbuf") == 0) { + CONFIG.rcvbuf_size = mystrtol(q, 10); + if (CONFIG.rcvbuf_size < 64) { + TRACE_CONFIG("Receive buffer size should be larger than 64.\n"); + return -1; + } + } else if (strcmp(p, "sndbuf") == 0) { + CONFIG.sndbuf_size = mystrtol(q, 10); + if (CONFIG.sndbuf_size < 64) { + TRACE_CONFIG("Send buffer size should be larger than 64.\n"); + return -1; + } + } else if (strcmp(p, "tcp_timeout") == 0) { + CONFIG.tcp_timeout = mystrtol(q, 10); + if (CONFIG.tcp_timeout > 0) { + CONFIG.tcp_timeout = SEC_TO_USEC(CONFIG.tcp_timeout) / TIME_TICK; + } + } else if (strcmp(p, "tcp_timewait") == 0) { + CONFIG.tcp_timewait = mystrtol(q, 10); + if (CONFIG.tcp_timewait > 0) { + CONFIG.tcp_timewait = SEC_TO_USEC(CONFIG.tcp_timewait) / TIME_TICK; + } + } else if (strcmp(p, "stat_print") == 0) { + SaveInterfaceStatList(line + 
strlen(p) + 1); + } else if (strcmp(p, "port") == 0) { + if (strncmp(q, ALL_STRING, sizeof(ALL_STRING)) == 0) + SaveInterfaceInfo(q); + else + SaveInterfaceInfo(line + strlen(p) + 1); + } else if (strcmp(p, "io") == 0) { + AssignIOModule(q); + if (CheckIOModuleAccessPermissions() == -1) { + TRACE_CONFIG("[CAUTION] Run the app as root!\n"); + exit(EXIT_FAILURE); + } + } else if (strcmp(p, "num_mem_ch") == 0) { + CONFIG.num_mem_ch = mystrtol(q, 10); +#ifdef ENABLE_ONVM + } else if (strcmp(p, "onvm_inst") == 0) { + CONFIG.onvm_inst = mystrtol(q, 10); + } else if (strcmp(p, "onvm_serv") == 0) { + CONFIG.onvm_serv = mystrtol(q, 10); + } else if (strcmp(p, "onvm_dest") == 0) { + CONFIG.onvm_dest = mystrtol(q, 10); +#endif + } else if (strcmp(p, "multiprocess") == 0) { + SetMultiProcessSupport(line + strlen(p) + 1); + } else if (strcmp(p, "cc") == 0) { +#if USE_CCP + // ignore the parsing done by the second strtok_r so that we can get the full param string + *strchr(q, '\0') = ' '; + strcpy(CONFIG.cc, q); +#else + TRACE_CONFIG("[WARNING] 'cc' option provided, but CCP not enabled. 
define USE_CCP!\n"); + exit(EXIT_FAILURE); +#endif + + } else { + TRACE_CONFIG("Unknown option type: %s\n", line); + return -1; + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +int LoadConfiguration(const char *fname) +{ + FILE *fp; + char optstr[MAX_OPTLINE_LEN]; + + TRACE_CONFIG("----------------------------------------------------------" + "-----------------------\n"); + TRACE_CONFIG("Loading mtcp configuration from : %s\n", fname); + + fp = fopen(fname, "r"); + if (fp == NULL) { + perror("fopen"); + TRACE_CONFIG("Failed to load configuration file: %s\n", fname); + return -1; + } + +#ifndef DISABLE_DPDK + mpz_init(CONFIG._cpumask); +#endif + while (1) { + char *p; + char *temp; + + if (fgets(optstr, MAX_OPTLINE_LEN, fp) == NULL) + break; + + p = optstr; + + // skip comment + if ((temp = strchr(p, '#')) != NULL) + *temp = 0; + // remove front and tailing spaces + while (*p && isspace((int)*p)) + p++; + temp = p + strlen(p) - 1; + while (temp >= p && isspace((int)*temp)) + *temp = 0; + if (*p == 0) /* nothing more to process? 
*/ + continue; + + if (ParseConfiguration(p) < 0) { + fclose(fp); + return -1; + } + } + + fclose(fp); + + /* if rcvbuf is set but sndbuf is not, sndbuf = rcvbuf */ + if (CONFIG.sndbuf_size == -1 && CONFIG.rcvbuf_size != -1) + CONFIG.sndbuf_size = CONFIG.rcvbuf_size; + /* if sndbuf is set but rcvbuf is not, rcvbuf = sndbuf */ + if (CONFIG.rcvbuf_size == -1 && CONFIG.sndbuf_size != -1) + CONFIG.rcvbuf_size = CONFIG.sndbuf_size; + /* if sndbuf & rcvbuf are not set, rcvbuf = sndbuf = 8192 */ + if (CONFIG.rcvbuf_size == -1 && CONFIG.sndbuf_size == -1) + CONFIG.sndbuf_size = CONFIG.rcvbuf_size = 8192; + + return SetNetEnv(port_list, port_stat_list); + + return 0; +} +/*----------------------------------------------------------------------------*/ +void PrintConfiguration(void) +{ + int i; + + TRACE_CONFIG("Configurations:\n"); + TRACE_CONFIG("Number of CPU cores available: %d\n", num_cpus); + TRACE_CONFIG("Number of CPU cores to use: %d\n", CONFIG.num_cores); + TRACE_CONFIG("Maximum number of concurrency per core: %d\n", CONFIG.max_concurrency); + if (CONFIG.multi_process == 1) { + TRACE_CONFIG("Multi-process support is enabled\n"); + if (CONFIG.multi_process_is_master == 1) + TRACE_CONFIG("Current core is master (for multi-process)\n"); + else + TRACE_CONFIG("Current core is not master (for multi-process)\n"); + } + TRACE_CONFIG("Maximum number of preallocated buffers per core: %d\n", CONFIG.max_num_buffers); + TRACE_CONFIG("Receive buffer size: %d\n", CONFIG.rcvbuf_size); + TRACE_CONFIG("Send buffer size: %d\n", CONFIG.sndbuf_size); + + if (CONFIG.tcp_timeout > 0) { + TRACE_CONFIG("TCP timeout seconds: %d\n", USEC_TO_SEC(CONFIG.tcp_timeout * TIME_TICK)); + } else { + TRACE_CONFIG("TCP timeout check disabled.\n"); + } + TRACE_CONFIG("TCP timewait seconds: %d\n", USEC_TO_SEC(CONFIG.tcp_timewait * TIME_TICK)); + TRACE_CONFIG("NICs to print statistics:"); + for (i = 0; i < CONFIG.eths_num; i++) { + if (CONFIG.eths[i].stat_print) { + TRACE_CONFIG(" %s", 
CONFIG.eths[i].dev_name); + } + } + TRACE_CONFIG("\n"); + TRACE_CONFIG("----------------------------------------------------------" + "-----------------------\n"); +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/core.c b/lib/flash/mtcp/core.c new file mode 100644 index 0000000..9c66bae --- /dev/null +++ b/lib/flash/mtcp/core.c @@ -0,0 +1,1657 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" +#include "ps.h" +#include "eth_in.h" +#include "fhash.h" +#include "tcp_send_buffer.h" +#include "tcp_ring_buffer.h" +#include "socket.h" +#include "eth_out.h" +#include "tcp_in.h" +#include "tcp_out.h" +#include "mtcp_api.h" +#include "eventpoll.h" +#include "logger.h" +#include "config.h" +#include "arp.h" +#include "ip_out.h" +#include "timer.h" +#include "debug.h" +#if USE_CCP +#include "ccp.h" +#include "libccp/ccp.h" +#endif + +#ifndef DISABLE_DPDK +/* for launching rte thread */ +#include +#include +#endif + +#ifdef ENABLE_ONVM +#include "onvm_nflib.h" +#endif + +#define PS_CHUNK_SIZE 64 +#define RX_THRESH (PS_CHUNK_SIZE * 0.8) + +#define ROUND_STAT FALSE +#define EVENT_STAT FALSE +#define TESTING FALSE + +#define LOG_FILE_NAME "log" +#define MAX_FILE_NAME 1024 + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +#define PER_STREAM_SLICE 0.1 // in ms +#define PER_STREAM_TCHECK 1 // in ms +#define PS_SELECT_TIMEOUT 100 // in us + +#define GBPS(bytes) (bytes * 8.0 / (1000 * 1000 * 1000)) + +/*----------------------------------------------------------------------------*/ +/* handlers for threads */ +struct mtcp_thread_context *g_pctx[MAX_CPUS] = { 0 }; +struct log_thread_context *g_logctx[MAX_CPUS] = { 0 }; +/*----------------------------------------------------------------------------*/ +static pthread_t g_thread[MAX_CPUS] = { 0 }; +#if defined(PKTDUMP) || defined(DBGMSG) || defined(DBGFUNC) || defined(STREAM) || defined(STATE) || defined(STAT) || defined(APP) || \ + defined(EPOLL) || defined(DUMP_STREAM) +static pthread_t log_thread[MAX_CPUS] = { 0 }; +#endif +#if USE_CCP +static pthread_t ccp_run_thread = 0; +static pthread_t ccp_recv_thread[MAX_CPUS] = { 0 }; +#endif +/*----------------------------------------------------------------------------*/ +static sem_t g_init_sem[MAX_CPUS]; +static int running[MAX_CPUS] = { 0 }; +/*----------------------------------------------------------------------------*/ +mtcp_sighandler_t app_signal_handler; +static int sigint_cnt[MAX_CPUS] = { 0 }; +static struct timespec sigint_ts[MAX_CPUS]; +/*----------------------------------------------------------------------------*/ +static int mtcp_master = -1; +void mtcp_free_context(mctx_t mctx); +/*----------------------------------------------------------------------------*/ +static void HandleSignal(int signal) +{ + int i = 0; + + if (signal == SIGINT) { + int core; + struct timespec cur_ts; + +#ifdef ENABLE_ONVM + if (current_iomodule_func == &onvm_module_func) + onvm_nflib_stop(CONFIG.nf_local_ctx); +#endif + core = sched_getcpu(); + clock_gettime(CLOCK_REALTIME, &cur_ts); + + if (CONFIG.multi_process) { + for (i = 0; i < num_cpus; i++) + if (running[i] == TRUE) + g_pctx[i]->exit = TRUE; + } else { + if (sigint_cnt[core] > 0 && cur_ts.tv_sec > sigint_ts[core].tv_sec) { + for (i 
= 0; i < num_cpus; i++) { + if (running[i]) { + g_pctx[i]->exit = TRUE; + } + } + } else { + for (i = 0; i < num_cpus; i++) { + if (running[i]) + g_pctx[i]->interrupt = TRUE; + } + if (!app_signal_handler) { + for (i = 0; i < num_cpus; i++) { + if (running[i]) { + g_pctx[i]->exit = TRUE; + } + } + } + } + sigint_cnt[core]++; + clock_gettime(CLOCK_REALTIME, &sigint_ts[core]); + } + } + + if (signal != SIGUSR1) { + if (app_signal_handler) { + app_signal_handler(signal); + } + } +} +/*----------------------------------------------------------------------------*/ +static int AttachDevice(struct mtcp_thread_context *ctx) +{ + int working = -1; + mtcp_manager_t mtcp = ctx->mtcp_manager; + + working = mtcp->iom->link_devices(ctx); + + return working; +} +/*----------------------------------------------------------------------------*/ +#ifdef NETSTAT +static inline void InitStatCounter(struct stat_counter *counter) +{ + counter->cnt = 0; + counter->sum = 0; + counter->max = 0; + counter->min = 0; +} +/*----------------------------------------------------------------------------*/ +static inline void UpdateStatCounter(struct stat_counter *counter, uint64_t value) +{ + counter->cnt++; + counter->sum += value; + if (value > counter->max) + counter->max = value; + if (counter->min == 0 || value < counter->min) + counter->min = value; +} +/*----------------------------------------------------------------------------*/ +static inline uint64_t GetAverageStat(struct stat_counter *counter) +{ + return counter->cnt ? 
(counter->sum / counter->cnt) : 0; +} +/*----------------------------------------------------------------------------*/ +static inline int64_t TimeDiffUs(struct timeval *t2, struct timeval *t1) +{ + return (t2->tv_sec - t1->tv_sec) * 1000000 + (int64_t)(t2->tv_usec - t1->tv_usec); +} +/*----------------------------------------------------------------------------*/ +static inline void PrintThreadNetworkStats(mtcp_manager_t mtcp, struct net_stat *ns) +{ + int i; + + for (i = 0; i < CONFIG.eths_num; i++) { + ns->rx_packets[i] = mtcp->nstat.rx_packets[i] - mtcp->p_nstat.rx_packets[i]; + ns->rx_errors[i] = mtcp->nstat.rx_errors[i] - mtcp->p_nstat.rx_errors[i]; + ns->rx_bytes[i] = mtcp->nstat.rx_bytes[i] - mtcp->p_nstat.rx_bytes[i]; + ns->tx_packets[i] = mtcp->nstat.tx_packets[i] - mtcp->p_nstat.tx_packets[i]; + ns->tx_drops[i] = mtcp->nstat.tx_drops[i] - mtcp->p_nstat.tx_drops[i]; + ns->tx_bytes[i] = mtcp->nstat.tx_bytes[i] - mtcp->p_nstat.tx_bytes[i]; +#if NETSTAT_PERTHREAD + if (CONFIG.eths[i].stat_print) { + fprintf(stderr, + "[CPU%2d] %s flows: %6u, " + "RX: %7ld(pps) (err: %5ld), %5.2lf(Gbps), " + "TX: %7ld(pps), %5.2lf(Gbps)\n", + mtcp->ctx->cpu, CONFIG.eths[i].dev_name, mtcp->flow_cnt, ns->rx_packets[i], ns->rx_errors[i], + GBPS(ns->rx_bytes[i]), ns->tx_packets[i], GBPS(ns->tx_bytes[i])); + } +#endif + } +#ifdef ENABLELRO + ns->rx_gdptbytes = mtcp->nstat.rx_gdptbytes - mtcp->p_nstat.rx_gdptbytes; + ns->tx_gdptbytes = mtcp->nstat.tx_gdptbytes - mtcp->p_nstat.tx_gdptbytes; +#endif + mtcp->p_nstat = mtcp->nstat; +} +/*----------------------------------------------------------------------------*/ +#if ROUND_STAT +static inline void PrintThreadRoundStats(mtcp_manager_t mtcp, struct run_stat *rs) +{ +#define ROUND_DIV (1000) + rs->rounds = mtcp->runstat.rounds - mtcp->p_runstat.rounds; + rs->rounds_rx = mtcp->runstat.rounds_rx - mtcp->p_runstat.rounds_rx; + rs->rounds_rx_try = mtcp->runstat.rounds_rx_try - mtcp->p_runstat.rounds_rx_try; + rs->rounds_tx = 
mtcp->runstat.rounds_tx - mtcp->p_runstat.rounds_tx; + rs->rounds_tx_try = mtcp->runstat.rounds_tx_try - mtcp->p_runstat.rounds_tx_try; + rs->rounds_select = mtcp->runstat.rounds_select - mtcp->p_runstat.rounds_select; + rs->rounds_select_rx = mtcp->runstat.rounds_select_rx - mtcp->p_runstat.rounds_select_rx; + rs->rounds_select_tx = mtcp->runstat.rounds_select_tx - mtcp->p_runstat.rounds_select_tx; + rs->rounds_select_intr = mtcp->runstat.rounds_select_intr - mtcp->p_runstat.rounds_select_intr; + rs->rounds_twcheck = mtcp->runstat.rounds_twcheck - mtcp->p_runstat.rounds_twcheck; + mtcp->p_runstat = mtcp->runstat; +#if NETSTAT_PERTHREAD + fprintf(stderr, + "[CPU%2d] Rounds: %4ldK, " + "rx: %3ldK (try: %4ldK), tx: %3ldK (try: %4ldK), " + "ps_select: %4ld (rx: %4ld, tx: %4ld, intr: %3ld)\n", + mtcp->ctx->cpu, rs->rounds / ROUND_DIV, rs->rounds_rx / ROUND_DIV, rs->rounds_rx_try / ROUND_DIV, + rs->rounds_tx / ROUND_DIV, rs->rounds_tx_try / ROUND_DIV, rs->rounds_select, rs->rounds_select_rx, + rs->rounds_select_tx, rs->rounds_select_intr); +#endif +} +#endif /* ROUND_STAT */ +/*----------------------------------------------------------------------------*/ +#endif /* NETSTAT */ +/*----------------------------------------------------------------------------*/ +#if EVENT_STAT +static inline void PrintEventStat(int core, struct mtcp_epoll_stat *stat) +{ + fprintf(stderr, + "[CPU%2d] calls: %lu, waits: %lu, wakes: %lu, " + "issued: %lu, registered: %lu, invalidated: %lu, handled: %lu\n", + core, stat->calls, stat->waits, stat->wakes, stat->issued, stat->registered, stat->invalidated, stat->handled); + memset(stat, 0, sizeof(struct mtcp_epoll_stat)); +} +#endif /* EVENT_STAT */ +/*----------------------------------------------------------------------------*/ +#ifdef NETSTAT +static inline void PrintNetworkStats(mtcp_manager_t mtcp, uint32_t cur_ts) +{ +#define TIMEOUT 1 + int i; + struct net_stat ns; +#if ROUND_STAT + struct run_stat rs; +#endif /* ROUND_STAT */ +#ifdef 
NETSTAT_TOTAL + int j; + uint32_t gflow_cnt = 0; + struct net_stat g_nstat; +#if ROUND_STAT + struct run_stat g_runstat; +#endif /* ROUND_STAT */ +#endif /* NETSTAT_TOTAL */ + + if (TS_TO_MSEC(cur_ts - mtcp->p_nstat_ts) < SEC_TO_MSEC(TIMEOUT)) { + return; + } + + mtcp->p_nstat_ts = cur_ts; + gflow_cnt = 0; + memset(&g_nstat, 0, sizeof(struct net_stat)); + for (i = 0; i < CONFIG.num_cores; i++) { + if (running[i]) { + PrintThreadNetworkStats(g_mtcp[i], &ns); +#if NETSTAT_TOTAL + gflow_cnt += g_mtcp[i]->flow_cnt; + for (j = 0; j < CONFIG.eths_num; j++) { + g_nstat.rx_packets[j] += ns.rx_packets[j]; + g_nstat.rx_errors[j] += ns.rx_errors[j]; + g_nstat.rx_bytes[j] += ns.rx_bytes[j]; + g_nstat.tx_packets[j] += ns.tx_packets[j]; + g_nstat.tx_drops[j] += ns.tx_drops[j]; + g_nstat.tx_bytes[j] += ns.tx_bytes[j]; + } +#ifdef ENABLELRO + g_nstat.rx_gdptbytes += ns.rx_gdptbytes; + g_nstat.tx_gdptbytes += ns.tx_gdptbytes; +#endif +#endif + } + } +#if NETSTAT_TOTAL + for (i = 0; i < CONFIG.eths_num; i++) { + if (CONFIG.eths[i].stat_print) { + fprintf(stderr, + "[ ALL ] %s flows: %6u, " + "RX: %7ld(pps) (err: %5ld), %5.2lf(Gbps), " + "TX: %7ld(pps), %5.2lf(Gbps)\n", + CONFIG.eths[i].dev_name, gflow_cnt, g_nstat.rx_packets[i], g_nstat.rx_errors[i], + GBPS(g_nstat.rx_bytes[i]), g_nstat.tx_packets[i], GBPS(g_nstat.tx_bytes[i])); + } + } +#ifdef ENABLELRO + fprintf(stderr, "[ ALL ] Goodput RX: %5.2lf(Gbps), TX: %5.2lf(Gbps)\n", GBPS(g_nstat.rx_gdptbytes), + GBPS(g_nstat.tx_gdptbytes)); +#endif +#endif + +#if ROUND_STAT + memset(&g_runstat, 0, sizeof(struct run_stat)); + for (i = 0; i < CONFIG.num_cores; i++) { + if (running[i]) { + PrintThreadRoundStats(g_mtcp[i], &rs); +#if 0 + g_runstat.rounds += rs.rounds; + g_runstat.rounds_rx += rs.rounds_rx; + g_runstat.rounds_rx_try += rs.rounds_rx_try; + g_runstat.rounds_tx += rs.rounds_tx; + g_runstat.rounds_tx_try += rs.rounds_tx_try; + g_runstat.rounds_select += rs.rounds_select; + g_runstat.rounds_select_rx += rs.rounds_select_rx; + 
g_runstat.rounds_select_tx += rs.rounds_select_tx; +#endif + } + } +#if 0 + fprintf(stderr, "[ ALL ] Rounds: %4ldK, " + "rx: %3ldK (try: %4ldK), tx: %3ldK (try: %4ldK), " + "ps_select: %4ld (rx: %4ld, tx: %4ld)\n", + g_runstat.rounds / 1000, g_runstat.rounds_rx / 1000, + g_runstat.rounds_rx_try / 1000, g_runstat.rounds_tx / 1000, + g_runstat.rounds_tx_try / 1000, g_runstat.rounds_select, + g_runstat.rounds_select_rx, g_runstat.rounds_select_tx); +#endif +#endif /* ROUND_STAT */ + +#if EVENT_STAT + for (i = 0; i < CONFIG.num_cores; i++) { + if (running[i] && g_mtcp[i]->ep) { + PrintEventStat(i, &g_mtcp[i]->ep->stat); + } + } +#endif + + fflush(stderr); +} +#endif /* NETSTAT */ +/*----------------------------------------------------------------------------*/ +#if BLOCKING_SUPPORT +static inline void FlushAcceptEvents(mtcp_manager_t mtcp) +{ + STAT_COUNT(mtcp->runstat.rounds_accept); + + pthread_mutex_lock(&mtcp->listener->accept_lock); + if (!StreamQueueIsEmpty(mtcp->listener->acceptq)) { + pthread_cond_signal(&mtcp->listener->accept_cond); + } + pthread_mutex_unlock(&mtcp->listener->accept_lock); +} +/*----------------------------------------------------------------------------*/ +static inline void FlushWriteEvents(mtcp_manager_t mtcp, int thresh) +{ + tcp_stream *walk; + tcp_stream *next, *last; + int cnt; + + STAT_COUNT(mtcp->runstat.rounds_write); + + /* Notify available sending buffer (recovered peer window) */ + cnt = 0; + walk = TAILQ_FIRST(&mtcp->snd_br_list); + last = TAILQ_LAST(&mtcp->snd_br_list, snd_br_head); + while (walk) { + if (++cnt > thresh) + break; + + next = TAILQ_NEXT(walk, sndvar->snd_br_link); + TRACE_LOOP("Inside send broadcasting list. 
cnt: %u\n", cnt); + TAILQ_REMOVE(&mtcp->snd_br_list, walk, sndvar->snd_br_link); + mtcp->snd_br_list_cnt--; + if (walk->on_snd_br_list) { + TRACE_SNDBUF("Broadcasting available sending buffer!\n"); + if (!(walk->epoll & MTCP_EPOLLOUT)) { + pthread_cond_signal(&walk->write_cond); + walk->on_snd_br_list = FALSE; + } + } + + if (walk == last) + break; + walk = next; + } +} +/*----------------------------------------------------------------------------*/ +static inline void FlushReadEvents(mtcp_manager_t mtcp, int thresh) +{ + tcp_stream *walk; + tcp_stream *next, *last; + int cnt; + + STAT_COUNT(mtcp->runstat.rounds_read); + + /* Notify receiving event */ + cnt = 0; + walk = TAILQ_FIRST(&mtcp->rcv_br_list); + last = TAILQ_LAST(&mtcp->rcv_br_list, rcv_br_head); + while (walk) { + if (++cnt > thresh) + break; + + next = TAILQ_NEXT(walk, rcvvar->rcv_br_link); + TRACE_LOOP("Inside recv broadcasting list. cnt: %u\n", cnt); + TAILQ_REMOVE(&mtcp->rcv_br_list, walk, rcvvar->rcv_br_link); + mtcp->rcv_br_list_cnt--; + if (walk->on_rcv_br_list) { + if (!(walk->epoll & MTCP_EPOLLIN)) { + TRACE_TEMP("Broadcasting read contition\n"); + pthread_cond_signal(&walk->read_cond); + walk->on_rcv_br_list = FALSE; + } + } + + if (walk == last) + break; + walk = next; + } +} +#endif +/*----------------------------------------------------------------------------*/ +static inline void FlushEpollEvents(mtcp_manager_t mtcp, uint32_t cur_ts) +{ + struct mtcp_epoll *ep = mtcp->ep; + struct event_queue *usrq = ep->usr_queue; + struct event_queue *mtcpq = ep->mtcp_queue; + + pthread_mutex_lock(&ep->epoll_lock); + if (ep->mtcp_queue->num_events > 0) { + /* while mtcp_queue have events */ + /* and usr_queue is not full */ + while (mtcpq->num_events > 0 && usrq->num_events < usrq->size) { + /* copy the event from mtcp_queue to usr_queue */ + usrq->events[usrq->end++] = mtcpq->events[mtcpq->start++]; + + if (usrq->end >= usrq->size) + usrq->end = 0; + usrq->num_events++; + + if (mtcpq->start >= 
mtcpq->size) + mtcpq->start = 0; + mtcpq->num_events--; + } + } + + /* if there are pending events, wake up user */ + if (ep->waiting && (ep->usr_queue->num_events > 0 || ep->usr_shadow_queue->num_events > 0)) { + STAT_COUNT(mtcp->runstat.rounds_epoll); + TRACE_EPOLL("Broadcasting events. num: %d, cur_ts: %u, prev_ts: %u\n", ep->usr_queue->num_events, cur_ts, + mtcp->ts_last_event); + mtcp->ts_last_event = cur_ts; + ep->stat.wakes++; + pthread_cond_signal(&ep->epoll_cond); + } + pthread_mutex_unlock(&ep->epoll_lock); +} +/*----------------------------------------------------------------------------*/ +static inline void HandleApplicationCalls(mtcp_manager_t mtcp, uint32_t cur_ts) +{ + tcp_stream *stream; + int cnt, max_cnt; + int handled, delayed; + int control, send, ack; + + /* connect handling */ + while ((stream = StreamDequeue(mtcp->connectq))) { + AddtoControlList(mtcp, stream, cur_ts); + } + + /* send queue handling */ + while ((stream = StreamDequeue(mtcp->sendq))) { + stream->sndvar->on_sendq = FALSE; + AddtoSendList(mtcp, stream); + } + + /* ack queue handling */ + while ((stream = StreamDequeue(mtcp->ackq))) { + stream->sndvar->on_ackq = FALSE; + EnqueueACK(mtcp, stream, cur_ts, ACK_OPT_AGGREGATE); + } + + /* close handling */ + handled = delayed = 0; + control = send = ack = 0; + while ((stream = StreamDequeue(mtcp->closeq))) { + struct tcp_send_vars *sndvar = stream->sndvar; + sndvar->on_closeq = FALSE; + + if (sndvar->sndbuf) { + sndvar->fss = sndvar->sndbuf->head_seq + sndvar->sndbuf->len; + } else { + sndvar->fss = stream->snd_nxt; + } + + if (CONFIG.tcp_timeout > 0) + RemoveFromTimeoutList(mtcp, stream); + + if (stream->have_reset) { + handled++; + if (stream->state != TCP_ST_CLOSED) { + stream->close_reason = TCP_RESET; + stream->state = TCP_ST_CLOSED; + TRACE_STATE("Stream %d: TCP_ST_CLOSED\n", stream->id); + DestroyTCPStream(mtcp, stream); + } else { + TRACE_ERROR("Stream already closed.\n"); + } + + } else if (sndvar->on_control_list) { + 
sndvar->on_closeq_int = TRUE; + StreamInternalEnqueue(mtcp->closeq_int, stream); + delayed++; + if (sndvar->on_control_list) + control++; + if (sndvar->on_send_list) + send++; + if (sndvar->on_ack_list) + ack++; + + } else if (sndvar->on_send_list || sndvar->on_ack_list) { + handled++; + if (stream->state == TCP_ST_ESTABLISHED) { + stream->state = TCP_ST_FIN_WAIT_1; + TRACE_STATE("Stream %d: TCP_ST_FIN_WAIT_1\n", stream->id); + + } else if (stream->state == TCP_ST_CLOSE_WAIT) { + stream->state = TCP_ST_LAST_ACK; + TRACE_STATE("Stream %d: TCP_ST_LAST_ACK\n", stream->id); + } + stream->control_list_waiting = TRUE; + + } else if (stream->state != TCP_ST_CLOSED) { + handled++; + if (stream->state == TCP_ST_ESTABLISHED) { + stream->state = TCP_ST_FIN_WAIT_1; + TRACE_STATE("Stream %d: TCP_ST_FIN_WAIT_1\n", stream->id); + + } else if (stream->state == TCP_ST_CLOSE_WAIT) { + stream->state = TCP_ST_LAST_ACK; + TRACE_STATE("Stream %d: TCP_ST_LAST_ACK\n", stream->id); + } + //sndvar->rto = TCP_FIN_RTO; + //UpdateRetransmissionTimer(mtcp, stream, mtcp->cur_ts); + AddtoControlList(mtcp, stream, cur_ts); + } else { + TRACE_ERROR("Already closed connection!\n"); + } + } + TRACE_ROUND("Handling close connections. 
cnt: %d\n", cnt); + + cnt = 0; + max_cnt = mtcp->closeq_int->count; + while (cnt++ < max_cnt) { + stream = StreamInternalDequeue(mtcp->closeq_int); + + if (stream->sndvar->on_control_list) { + StreamInternalEnqueue(mtcp->closeq_int, stream); + + } else if (stream->state != TCP_ST_CLOSED) { + handled++; + stream->sndvar->on_closeq_int = FALSE; + if (stream->state == TCP_ST_ESTABLISHED) { + stream->state = TCP_ST_FIN_WAIT_1; + TRACE_STATE("Stream %d: TCP_ST_FIN_WAIT_1\n", stream->id); + + } else if (stream->state == TCP_ST_CLOSE_WAIT) { + stream->state = TCP_ST_LAST_ACK; + TRACE_STATE("Stream %d: TCP_ST_LAST_ACK\n", stream->id); + } + AddtoControlList(mtcp, stream, cur_ts); + } else { + stream->sndvar->on_closeq_int = FALSE; + TRACE_ERROR("Already closed connection!\n"); + } + } + + /* reset handling */ + while ((stream = StreamDequeue(mtcp->resetq))) { + stream->sndvar->on_resetq = FALSE; + + if (CONFIG.tcp_timeout > 0) + RemoveFromTimeoutList(mtcp, stream); + + if (stream->have_reset) { + if (stream->state != TCP_ST_CLOSED) { + stream->close_reason = TCP_RESET; + stream->state = TCP_ST_CLOSED; + TRACE_STATE("Stream %d: TCP_ST_CLOSED\n", stream->id); + DestroyTCPStream(mtcp, stream); + } else { + TRACE_ERROR("Stream already closed.\n"); + } + + } else if (stream->sndvar->on_control_list || stream->sndvar->on_send_list || stream->sndvar->on_ack_list) { + /* wait until all the queues are flushed */ + stream->sndvar->on_resetq_int = TRUE; + StreamInternalEnqueue(mtcp->resetq_int, stream); + + } else { + if (stream->state != TCP_ST_CLOSED) { + stream->close_reason = TCP_ACTIVE_CLOSE; + stream->state = TCP_ST_CLOSED; + TRACE_STATE("Stream %d: TCP_ST_CLOSED\n", stream->id); + AddtoControlList(mtcp, stream, cur_ts); + } else { + TRACE_ERROR("Stream already closed.\n"); + } + } + } + TRACE_ROUND("Handling reset connections. 
cnt: %d\n", cnt); + + cnt = 0; + max_cnt = mtcp->resetq_int->count; + while (cnt++ < max_cnt) { + stream = StreamInternalDequeue(mtcp->resetq_int); + + if (stream->sndvar->on_control_list || stream->sndvar->on_send_list || stream->sndvar->on_ack_list) { + /* wait until all the queues are flushed */ + StreamInternalEnqueue(mtcp->resetq_int, stream); + + } else { + stream->sndvar->on_resetq_int = FALSE; + + if (stream->state != TCP_ST_CLOSED) { + stream->close_reason = TCP_ACTIVE_CLOSE; + stream->state = TCP_ST_CLOSED; + TRACE_STATE("Stream %d: TCP_ST_CLOSED\n", stream->id); + AddtoControlList(mtcp, stream, cur_ts); + } else { + TRACE_ERROR("Stream already closed.\n"); + } + } + } + + /* destroy streams in destroyq */ + while ((stream = StreamDequeue(mtcp->destroyq))) { + DestroyTCPStream(mtcp, stream); + } + + mtcp->wakeup_flag = FALSE; +} +/*----------------------------------------------------------------------------*/ +static inline void WritePacketsToChunks(mtcp_manager_t mtcp, uint32_t cur_ts) +{ + int thresh = CONFIG.max_concurrency; + int i; + + /* Set the threshold to CONFIG.max_concurrency to send ACK immediately */ + /* Otherwise, set to appropriate value (e.g. 
thresh) */ + assert(mtcp->g_sender != NULL); + if (mtcp->g_sender->control_list_cnt) + WriteTCPControlList(mtcp, mtcp->g_sender, cur_ts, thresh); + if (mtcp->g_sender->ack_list_cnt) + WriteTCPACKList(mtcp, mtcp->g_sender, cur_ts, thresh); + if (mtcp->g_sender->send_list_cnt) + WriteTCPDataList(mtcp, mtcp->g_sender, cur_ts, thresh); + + for (i = 0; i < CONFIG.eths_num; i++) { + assert(mtcp->n_sender[i] != NULL); + if (mtcp->n_sender[i]->control_list_cnt) + WriteTCPControlList(mtcp, mtcp->n_sender[i], cur_ts, thresh); + if (mtcp->n_sender[i]->ack_list_cnt) + WriteTCPACKList(mtcp, mtcp->n_sender[i], cur_ts, thresh); + if (mtcp->n_sender[i]->send_list_cnt) + WriteTCPDataList(mtcp, mtcp->n_sender[i], cur_ts, thresh); + } +} +/*----------------------------------------------------------------------------*/ +#if TESTING +static int DestroyRemainingFlows(mtcp_manager_t mtcp) +{ + struct hashtable *ht = mtcp->tcp_flow_table; + tcp_stream *walk; + int cnt, i; + + cnt = 0; +#if 0 + thread_printf(mtcp, mtcp->log_fp, + "CPU %d: Flushing remaining flows.\n", mtcp->ctx->cpu); +#endif + for (i = 0; i < NUM_BINS; i++) { + TAILQ_FOREACH(walk, &ht->ht_table[i], rcvvar->he_link) + { +#ifdef DUMP_STREAM + thread_printf(mtcp, mtcp->log_fp, "CPU %d: Destroying stream %d\n", mtcp->ctx->cpu, walk->id); + DumpStream(mtcp, walk); +#endif + DestroyTCPStream(mtcp, walk); + cnt++; + } + } + + return cnt; +} +#endif +/*----------------------------------------------------------------------------*/ +static void InterruptApplication(mtcp_manager_t mtcp) +{ + int i; + struct tcp_listener *listener = NULL; + + /* interrupt if the mtcp_epoll_wait() is waiting */ + if (mtcp->ep) { + pthread_mutex_lock(&mtcp->ep->epoll_lock); + if (mtcp->ep->waiting) { + pthread_cond_signal(&mtcp->ep->epoll_cond); + } + pthread_mutex_unlock(&mtcp->ep->epoll_lock); + } + + /* interrupt if the accept() is waiting */ + /* this may be a looong loop but this is called only on exit */ + for (i = 0; i < MAX_PORT; i++) { + 
listener = ListenerHTSearch(mtcp->listeners, &i); + if (listener != NULL) { + pthread_mutex_lock(&listener->accept_lock); + if (!(listener->socket->opts & MTCP_NONBLOCK)) { + pthread_cond_signal(&listener->accept_cond); + } + pthread_mutex_unlock(&listener->accept_lock); + } + } +} +/*----------------------------------------------------------------------------*/ +static void RunMainLoop(struct mtcp_thread_context *ctx) +{ + mtcp_manager_t mtcp = ctx->mtcp_manager; + int i; + int recv_cnt; + int rx_inf, tx_inf; + struct timeval cur_ts = { 0 }; + uint32_t ts, ts_prev; + int thresh; + + gettimeofday(&cur_ts, NULL); + TRACE_DBG("CPU %d: mtcp thread running.\n", ctx->cpu); + + ts = ts_prev = 0; + while ((!ctx->done || mtcp->flow_cnt) && !ctx->exit) { + STAT_COUNT(mtcp->runstat.rounds); + recv_cnt = 0; + + gettimeofday(&cur_ts, NULL); + ts = TIMEVAL_TO_TS(&cur_ts); + mtcp->cur_ts = ts; + for (rx_inf = 0; rx_inf < CONFIG.eths_num; rx_inf++) { + static uint16_t len; + static uint8_t *pktbuf; + recv_cnt = mtcp->iom->recv_pkts(ctx, rx_inf); + STAT_COUNT(mtcp->runstat.rounds_rx_try); + + for (i = 0; i < recv_cnt; i++) { + pktbuf = mtcp->iom->get_rptr(mtcp->ctx, rx_inf, i, &len); + if (pktbuf != NULL) { + if (ProcessPacket(mtcp, rx_inf, ts, pktbuf, len) != TRUE) + mtcp->iom->release_pkt(mtcp->ctx, rx_inf, pktbuf, len); + } +#ifdef NETSTAT + else + mtcp->nstat.rx_errors[rx_inf]++; +#endif + } +#ifndef DISABLE_AFXDP + mtcp->iom->drop_pkts(mtcp->ctx); +#endif + } + STAT_COUNT(mtcp->runstat.rounds_rx); + + /* interaction with application */ + if (mtcp->flow_cnt > 0) { + /* check retransmission timeout and timewait expire */ +#if 0 + thresh = (int)mtcp->flow_cnt / (TS_TO_USEC(PER_STREAM_TCHECK)); + assert(thresh >= 0); + if (thresh == 0) + thresh = 1; + if (recv_cnt > 0 && thresh > recv_cnt) + thresh = recv_cnt; +#endif + thresh = CONFIG.max_concurrency; + + /* Eunyoung, you may fix this later + * if there is no rcv packet, we will send as much as possible + */ + if (thresh == -1) 
+ thresh = CONFIG.max_concurrency; + + CheckRtmTimeout(mtcp, ts, thresh); + CheckTimewaitExpire(mtcp, ts, CONFIG.max_concurrency); + + if (CONFIG.tcp_timeout > 0 && ts != ts_prev) { + CheckConnectionTimeout(mtcp, ts, thresh); + } + } + + /* if epoll is in use, flush all the queued events */ + if (mtcp->ep) { + FlushEpollEvents(mtcp, ts); + } + + if (mtcp->flow_cnt > 0) { + /* hadnle stream queues */ + HandleApplicationCalls(mtcp, ts); + } + + WritePacketsToChunks(mtcp, ts); + + /* send packets from write buffer */ + /* send until tx is available */ + for (tx_inf = 0; tx_inf < CONFIG.eths_num; tx_inf++) { + mtcp->iom->send_pkts(ctx, tx_inf); + } + + if (ts != ts_prev) { + ts_prev = ts; + if (ctx->cpu == mtcp_master) { + ARPTimer(mtcp, ts); +#ifdef NETSTAT + PrintNetworkStats(mtcp, ts); +#endif + } + } + + mtcp->iom->select(ctx); + + if (ctx->interrupt) { + InterruptApplication(mtcp); + } + } + +#if TESTING + DestroyRemainingFlows(mtcp); +#endif + + TRACE_DBG("MTCP thread %d out of main loop.\n", ctx->cpu); + /* flush logs */ + flush_log_data(mtcp); + TRACE_DBG("MTCP thread %d flushed logs.\n", ctx->cpu); + InterruptApplication(mtcp); + TRACE_INFO("MTCP thread %d finished.\n", ctx->cpu); +} +/*----------------------------------------------------------------------------*/ +static struct mtcp_sender *CreateMTCPSender(int ifidx) +{ + struct mtcp_sender *sender; + + sender = (struct mtcp_sender *)calloc(1, sizeof(struct mtcp_sender)); + if (!sender) { + return NULL; + } + + sender->ifidx = ifidx; + + TAILQ_INIT(&sender->control_list); + TAILQ_INIT(&sender->send_list); + TAILQ_INIT(&sender->ack_list); + + sender->control_list_cnt = 0; + sender->send_list_cnt = 0; + sender->ack_list_cnt = 0; + + return sender; +} +/*----------------------------------------------------------------------------*/ +static void DestroyMTCPSender(struct mtcp_sender *sender) +{ + free(sender); +} +/*----------------------------------------------------------------------------*/ +static 
mtcp_manager_t InitializeMTCPManager(struct mtcp_thread_context *ctx) +{ + mtcp_manager_t mtcp; + char log_name[MAX_FILE_NAME]; + int i; + + mtcp = (mtcp_manager_t)calloc(1, sizeof(struct mtcp_manager)); + if (!mtcp) { + perror("malloc"); + fprintf(stderr, "Failed to allocate mtcp_manager.\n"); + return NULL; + } + g_mtcp[ctx->cpu] = mtcp; + + mtcp->tcp_flow_table = CreateHashtable(HashFlow, EqualFlow, NUM_BINS_FLOWS); + if (!mtcp->tcp_flow_table) { + CTRACE_ERROR("Falied to allocate tcp flow table.\n"); + return NULL; + } + +#if USE_CCP + mtcp->tcp_sid_table = CreateHashtable(HashSID, EqualSID, NUM_BINS_FLOWS); + if (!mtcp->tcp_sid_table) { + CTRACE_ERROR("Failed to allocate tcp sid lookup table.\n"); + return NULL; + } +#endif + + mtcp->listeners = CreateHashtable(HashListener, EqualListener, NUM_BINS_LISTENERS); + if (!mtcp->listeners) { + CTRACE_ERROR("Failed to allocate listener table.\n"); + return NULL; + } + + mtcp->ctx = ctx; +#if !defined(DISABLE_DPDK) && !ENABLE_ONVM + char pool_name[RTE_MEMPOOL_NAMESIZE]; + sprintf(pool_name, "flow_pool_%d", ctx->cpu); + mtcp->flow_pool = MPCreate(pool_name, sizeof(tcp_stream), sizeof(tcp_stream) * CONFIG.max_concurrency); + if (!mtcp->flow_pool) { + CTRACE_ERROR("Failed to allocate tcp flow pool.\n"); + return NULL; + } + sprintf(pool_name, "rv_pool_%d", ctx->cpu); + mtcp->rv_pool = MPCreate(pool_name, sizeof(struct tcp_recv_vars), sizeof(struct tcp_recv_vars) * CONFIG.max_concurrency); + if (!mtcp->rv_pool) { + CTRACE_ERROR("Failed to allocate tcp recv variable pool.\n"); + return NULL; + } + sprintf(pool_name, "sv_pool_%d", ctx->cpu); + mtcp->sv_pool = MPCreate(pool_name, sizeof(struct tcp_send_vars), sizeof(struct tcp_send_vars) * CONFIG.max_concurrency); + if (!mtcp->sv_pool) { + CTRACE_ERROR("Failed to allocate tcp send variable pool.\n"); + return NULL; + } +#else + mtcp->flow_pool = MPCreate(sizeof(tcp_stream), sizeof(tcp_stream) * CONFIG.max_concurrency); + if (!mtcp->flow_pool) { + CTRACE_ERROR("Failed to 
allocate tcp flow pool.\n"); + return NULL; + } + mtcp->rv_pool = MPCreate(sizeof(struct tcp_recv_vars), sizeof(struct tcp_recv_vars) * CONFIG.max_concurrency); + if (!mtcp->rv_pool) { + CTRACE_ERROR("Failed to allocate tcp recv variable pool.\n"); + return NULL; + } + mtcp->sv_pool = MPCreate(sizeof(struct tcp_send_vars), sizeof(struct tcp_send_vars) * CONFIG.max_concurrency); + if (!mtcp->sv_pool) { + CTRACE_ERROR("Failed to allocate tcp send variable pool.\n"); + return NULL; + } +#endif + mtcp->rbm_snd = SBManagerCreate(mtcp, CONFIG.sndbuf_size, CONFIG.max_num_buffers); + if (!mtcp->rbm_snd) { + CTRACE_ERROR("Failed to create send ring buffer.\n"); + return NULL; + } + + mtcp->rbm_rcv = RBManagerCreate(mtcp, CONFIG.rcvbuf_size, CONFIG.max_num_buffers); + if (!mtcp->rbm_rcv) { + CTRACE_ERROR("Failed to create recv ring buffer.\n"); + return NULL; + } + + InitializeTCPStreamManager(); + + mtcp->smap = (socket_map_t)calloc(CONFIG.max_concurrency, sizeof(struct socket_map)); + if (!mtcp->smap) { + perror("calloc"); + CTRACE_ERROR("Failed to allocate memory for stream map.\n"); + return NULL; + } + TAILQ_INIT(&mtcp->free_smap); + for (i = 0; i < CONFIG.max_concurrency; i++) { + mtcp->smap[i].id = i; + mtcp->smap[i].socktype = MTCP_SOCK_UNUSED; + memset(&mtcp->smap[i].saddr, 0, sizeof(struct sockaddr_in)); + mtcp->smap[i].stream = NULL; + TAILQ_INSERT_TAIL(&mtcp->free_smap, &mtcp->smap[i], free_smap_link); + } + + mtcp->ep = NULL; + + snprintf(log_name, MAX_FILE_NAME, LOG_FILE_NAME "_%d", ctx->cpu); + mtcp->log_fp = fopen(log_name, "w"); + if (!mtcp->log_fp) { + perror("fopen"); + CTRACE_ERROR("Failed to create file for logging.\n"); + return NULL; + } + mtcp->sp_fd = g_logctx[ctx->cpu]->pair_sp_fd; + mtcp->logger = g_logctx[ctx->cpu]; + + mtcp->connectq = CreateStreamQueue(BACKLOG_SIZE); + if (!mtcp->connectq) { + CTRACE_ERROR("Failed to create connect queue.\n"); + return NULL; + } + mtcp->sendq = CreateStreamQueue(CONFIG.max_concurrency); + if (!mtcp->sendq) { + 
CTRACE_ERROR("Failed to create send queue.\n"); + return NULL; + } + mtcp->ackq = CreateStreamQueue(CONFIG.max_concurrency); + if (!mtcp->ackq) { + CTRACE_ERROR("Failed to create ack queue.\n"); + return NULL; + } + mtcp->closeq = CreateStreamQueue(CONFIG.max_concurrency); + if (!mtcp->closeq) { + CTRACE_ERROR("Failed to create close queue.\n"); + return NULL; + } + mtcp->closeq_int = CreateInternalStreamQueue(CONFIG.max_concurrency); + if (!mtcp->closeq_int) { + CTRACE_ERROR("Failed to create close queue.\n"); + return NULL; + } + mtcp->resetq = CreateStreamQueue(CONFIG.max_concurrency); + if (!mtcp->resetq) { + CTRACE_ERROR("Failed to create reset queue.\n"); + return NULL; + } + mtcp->resetq_int = CreateInternalStreamQueue(CONFIG.max_concurrency); + if (!mtcp->resetq_int) { + CTRACE_ERROR("Failed to create reset queue.\n"); + return NULL; + } + mtcp->destroyq = CreateStreamQueue(CONFIG.max_concurrency); + if (!mtcp->destroyq) { + CTRACE_ERROR("Failed to create destroy queue.\n"); + return NULL; + } + + mtcp->g_sender = CreateMTCPSender(-1); + if (!mtcp->g_sender) { + CTRACE_ERROR("Failed to create global sender structure.\n"); + return NULL; + } + for (i = 0; i < CONFIG.eths_num; i++) { + mtcp->n_sender[i] = CreateMTCPSender(i); + if (!mtcp->n_sender[i]) { + CTRACE_ERROR("Failed to create per-nic sender structure.\n"); + return NULL; + } + } + + mtcp->rto_store = InitRTOHashstore(); + TAILQ_INIT(&mtcp->timewait_list); + TAILQ_INIT(&mtcp->timeout_list); + +#if BLOCKING_SUPPORT + TAILQ_INIT(&mtcp->rcv_br_list); + TAILQ_INIT(&mtcp->snd_br_list); +#endif + + return mtcp; +} +/*----------------------------------------------------------------------------*/ +#if USE_CCP + +uint32_t libstartccp_run_forever(const char *alg_to_run, uint32_t log_fd); + +static void *CCPRunThread(void *arg) +{ + // Add ipc argument (always unix, so no need for user to provide manually) + char args[1024] = { 0 }; + int arglen = strlen(CONFIG.cc) - 1; + strncpy(args, CONFIG.cc, arglen); + 
strncpy(args + arglen, " --ipc=unix", 11); + args[arglen + 11] = '\0'; + + // Open fd for log file + FILE *ccp_log = fopen("cc.log", "w"); + if (ccp_log == NULL) { + perror("fopen cc.log"); + return 0; + } + TRACE_CCP("starting ccp thread with args: %s\n", args); + TRACE_CCP("printing output to ./cc.log\n"); + libstartccp_run_forever(args, fileno(ccp_log)); + + fclose(ccp_log); + + return 0; +} + +static void *CCPRecvLoopThread(void *arg) +{ + mtcp_manager_t mtcp = (mtcp_manager_t)arg; + mtcp_thread_context_t ctx = mtcp->ctx; + + int cpu = ctx->cpu; + mtcp_core_affinitize(cpu); + + TRACE_CCP("ccp recv loop thread started on cpu %d\n", cpu); + + char recvBuf[CCP_MAX_MSG_SIZE]; + int bytes_recvd; + while (!ctx->done && !ctx->exit) { + do { + bytes_recvd = recvfrom(mtcp->from_ccp, recvBuf, CCP_MAX_MSG_SIZE, 0, NULL, NULL); + if (bytes_recvd <= 0) { + if (bytes_recvd < 0) { + TRACE_ERROR("recv returned %d\n", bytes_recvd); + } + break; + } + if (!mtcp->to_ccp) { + setup_ccp_send_socket(mtcp); + } + ccp_read_msg(recvBuf, bytes_recvd); + } while (1); + } + return 0; +} +#endif +/*----------------------------------------------------------------------------*/ +static void *MTCPRunThread(void *arg) +{ + mctx_t mctx = (mctx_t)arg; + int cpu = mctx->cpu; + int working; + struct mtcp_manager *mtcp; + struct mtcp_thread_context *ctx; + + /* affinitize the thread to this core first */ +#ifndef DISABLE_DPDK + if (rte_lcore_id() == LCORE_ID_ANY) +#endif + { + mtcp_core_affinitize(cpu); + } + + /* memory alloc after core affinitization would use local memory + most time */ + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + perror("calloc"); + TRACE_ERROR("Failed to calloc mtcp context.\n"); + exit(-1); + } + ctx->thread = pthread_self(); + ctx->cpu = cpu; + mtcp = ctx->mtcp_manager = InitializeMTCPManager(ctx); + if (!mtcp) { + TRACE_ERROR("Failed to initialize mtcp manager.\n"); + exit(-1); + } + + /* assign mtcp context's underlying I/O module */ + mtcp->iom = 
current_iomodule_func; + + /* I/O initializing */ + mtcp->iom->init_handle(ctx); + + if (pthread_mutex_init(&ctx->smap_lock, NULL)) { + perror("pthread_mutex_init of ctx->smap_lock\n"); + exit(-1); + } + + if (pthread_mutex_init(&ctx->flow_pool_lock, NULL)) { + perror("pthread_mutex_init of ctx->flow_pool_lock\n"); + exit(-1); + } + + if (pthread_mutex_init(&ctx->socket_pool_lock, NULL)) { + perror("pthread_mutex_init of ctx->socket_pool_lock\n"); + exit(-1); + } + + SQ_LOCK_INIT(&ctx->connect_lock, "ctx->connect_lock", exit(-1)); + SQ_LOCK_INIT(&ctx->close_lock, "ctx->close_lock", exit(-1)); + SQ_LOCK_INIT(&ctx->reset_lock, "ctx->reset_lock", exit(-1)); + SQ_LOCK_INIT(&ctx->sendq_lock, "ctx->sendq_lock", exit(-1)); + SQ_LOCK_INIT(&ctx->ackq_lock, "ctx->ackq_lock", exit(-1)); + SQ_LOCK_INIT(&ctx->destroyq_lock, "ctx->destroyq_lock", exit(-1)); + + /* remember this context pointer for signal processing */ + g_pctx[cpu] = ctx; + mlockall(MCL_CURRENT); + +#if USE_CCP + setup_ccp_connection(mtcp); + + if (cpu == 0 && pthread_create(&ccp_run_thread, NULL, CCPRunThread, (void *)mtcp) != 0) { + TRACE_ERROR("Failed to create thread running CCP on cpu 0"); + } + if (pthread_create(&ccp_recv_thread[cpu], NULL, CCPRecvLoopThread, (void *)mtcp) != 0) { + TRACE_ERROR("Failed to create thread for CCP receive loop on cpu %d\n", cpu); + return NULL; + } +#endif + + // attach (nic device, queue) + working = AttachDevice(ctx); + if (working != 0) { + perror("attach"); + return NULL; + } + + TRACE_DBG("CPU %d: initialization finished.\n", cpu); + + fprintf(stderr, "CPU %d: initialization finished.\n", cpu); + + sem_post(&g_init_sem[ctx->cpu]); + + /* start the main loop */ + RunMainLoop(ctx); + + struct mtcp_context m; + m.cpu = cpu; + mtcp_free_context(&m); + /* destroy hash tables */ + DestroyHashtable(g_mtcp[cpu]->tcp_flow_table); +#if USE_CCP + DestroyHashtable(g_mtcp[cpu]->tcp_sid_table); +#endif + DestroyHashtable(g_mtcp[cpu]->listeners); + + TRACE_DBG("MTCP thread %d 
finished.\n", ctx->cpu); + + return 0; +} +/*----------------------------------------------------------------------------*/ +#ifndef DISABLE_DPDK +int MTCPDPDKRunThread(void *arg) +{ + MTCPRunThread(arg); + return 0; +} +#endif +/*----------------------------------------------------------------------------*/ +mctx_t mtcp_create_context(int cpu) +{ + mctx_t mctx; + int ret; + + if (cpu >= CONFIG.num_cores) { + TRACE_ERROR("Failed initialize new mtcp context. " + "Requested cpu id %d exceed the number of cores %d configured to use.\n", + cpu, CONFIG.num_cores); + return NULL; + } + + /* check if mtcp_create_context() was already initialized */ + if (g_logctx[cpu] != NULL) { + TRACE_ERROR("%s was already initialized before!\n", __FUNCTION__); + return NULL; + } + + ret = sem_init(&g_init_sem[cpu], 0, 0); + if (ret) { + TRACE_ERROR("Failed initialize init_sem.\n"); + return NULL; + } + + mctx = (mctx_t)calloc(1, sizeof(struct mtcp_context)); + if (!mctx) { + TRACE_ERROR("Failed to allocate memory for mtcp_context.\n"); + return NULL; + } + mctx->cpu = cpu; + + /* initialize logger */ + g_logctx[cpu] = (struct log_thread_context *)calloc(1, sizeof(struct log_thread_context)); + if (!g_logctx[cpu]) { + perror("calloc"); + TRACE_ERROR("Failed to allocate memory for log thread context.\n"); + free(mctx); + return NULL; + } + InitLogThreadContext(g_logctx[cpu], cpu); +#if defined(PKTDUMP) || defined(DBGMSG) || defined(DBGFUNC) || defined(STREAM) || defined(STATE) || defined(STAT) || defined(APP) || \ + defined(EPOLL) || defined(DUMP_STREAM) + if (pthread_create(&log_thread[cpu], NULL, ThreadLogMain, (void *)g_logctx[cpu])) { + perror("pthread_create"); + TRACE_ERROR("Failed to create log thread\n"); + free(g_logctx[cpu]); + free(mctx); + return NULL; + } +#endif +#ifndef DISABLE_DPDK + /* Wake up mTCP threads (wake up I/O threads) */ + if (current_iomodule_func == &dpdk_module_func) { + int master; + master = rte_get_master_lcore(); + + if (master == whichCoreID(cpu)) { + 
lcore_config[master].ret = 0; + lcore_config[master].state = FINISHED; + + if (pthread_create(&g_thread[cpu], NULL, MTCPRunThread, (void *)mctx) != 0) { + TRACE_ERROR("pthread_create of mtcp thread failed!\n"); + return NULL; + } + } else + rte_eal_remote_launch(MTCPDPDKRunThread, mctx, whichCoreID(cpu)); + } else +#endif + { + if (pthread_create(&g_thread[cpu], NULL, MTCPRunThread, (void *)mctx) != 0) { + TRACE_ERROR("pthread_create of mtcp thread failed!\n"); + return NULL; + } + } + + sem_wait(&g_init_sem[cpu]); + sem_destroy(&g_init_sem[cpu]); + + running[cpu] = TRUE; + + if (mtcp_master < 0) { + mtcp_master = cpu; + TRACE_INFO("CPU %d is now the master thread.\n", mtcp_master); + } + + return mctx; +} +/*----------------------------------------------------------------------------*/ +void mtcp_destroy_context(mctx_t mctx) +{ + struct mtcp_thread_context *ctx = g_pctx[mctx->cpu]; + if (ctx != NULL) + ctx->done = 1; + free(mctx); +} +/*----------------------------------------------------------------------------*/ +void mtcp_free_context(mctx_t mctx) +{ + struct mtcp_thread_context *ctx = g_pctx[mctx->cpu]; + struct mtcp_manager *mtcp = ctx->mtcp_manager; + struct log_thread_context *log_ctx = mtcp->logger; + int ret, i; + + if (g_pctx[mctx->cpu] == NULL) + return; + + flush_log_data(mtcp); + + TRACE_DBG("CPU %d: mtcp_destroy_context()\n", mctx->cpu); + + /* close all stream sockets that are still open */ + if (!ctx->exit) { + for (i = 0; i < CONFIG.max_concurrency; i++) { + if (mtcp->smap[i].socktype == MTCP_SOCK_STREAM) { + TRACE_DBG("Closing remaining socket %d (%s)\n", i, TCPStateToString(mtcp->smap[i].stream)); +#ifdef DUMP_STREAM + DumpStream(mtcp, mtcp->smap[i].stream); +#endif + mtcp_close(mctx, i); + } + } + } + + ctx->done = 1; + + //pthread_kill(g_thread[mctx->cpu], SIGINT); + TRACE_INFO("MTCP thread %d joined.\n", mctx->cpu); + running[mctx->cpu] = FALSE; + + if (mtcp_master == mctx->cpu) { + for (i = 0; i < num_cpus; i++) { + if (i != mctx->cpu && 
running[i]) { + mtcp_master = i; + break; + } + } + } + + log_ctx->done = 1; + ret = write(log_ctx->pair_sp_fd, "F", 1); + assert(ret == 1); + UNUSED(ret); +#if defined(PKTDUMP) || defined(DBGMSG) || defined(DBGFUNC) || defined(STREAM) || defined(STATE) || defined(STAT) || defined(APP) || \ + defined(EPOLL) || defined(DUMP_STREAM) + pthread_join(log_thread[ctx->cpu], NULL); +#endif + fclose(mtcp->log_fp); + TRACE_LOG("Log thread %d joined.\n", mctx->cpu); + +#if USE_CCP + destroy_ccp_connection(mtcp); + close(mtcp->from_ccp); + close(mtcp->to_ccp); + TRACE_CCP("CCP thread %d joined.\n", mctx->cpu); +#endif + + if (mtcp->connectq) { + DestroyStreamQueue(mtcp->connectq); + mtcp->connectq = NULL; + } + if (mtcp->sendq) { + DestroyStreamQueue(mtcp->sendq); + mtcp->sendq = NULL; + } + if (mtcp->ackq) { + DestroyStreamQueue(mtcp->ackq); + mtcp->ackq = NULL; + } + if (mtcp->closeq) { + DestroyStreamQueue(mtcp->closeq); + mtcp->closeq = NULL; + } + if (mtcp->closeq_int) { + DestroyInternalStreamQueue(mtcp->closeq_int); + mtcp->closeq_int = NULL; + } + if (mtcp->resetq) { + DestroyStreamQueue(mtcp->resetq); + mtcp->resetq = NULL; + } + if (mtcp->resetq_int) { + DestroyInternalStreamQueue(mtcp->resetq_int); + mtcp->resetq_int = NULL; + } + if (mtcp->destroyq) { + DestroyStreamQueue(mtcp->destroyq); + mtcp->destroyq = NULL; + } + + DestroyMTCPSender(mtcp->g_sender); + for (i = 0; i < CONFIG.eths_num; i++) { + DestroyMTCPSender(mtcp->n_sender[i]); + } + + MPDestroy(mtcp->rv_pool); + MPDestroy(mtcp->sv_pool); + MPDestroy(mtcp->flow_pool); + + if (mtcp->ap) { + DestroyAddressPool(mtcp->ap); + mtcp->ap = NULL; + } + + SQ_LOCK_DESTROY(&ctx->connect_lock); + SQ_LOCK_DESTROY(&ctx->close_lock); + SQ_LOCK_DESTROY(&ctx->reset_lock); + SQ_LOCK_DESTROY(&ctx->sendq_lock); + SQ_LOCK_DESTROY(&ctx->ackq_lock); + SQ_LOCK_DESTROY(&ctx->destroyq_lock); + + //TRACE_INFO("MTCP thread %d destroyed.\n", mctx->cpu); + mtcp->iom->destroy_handle(ctx); + free(ctx); + if (g_logctx[mctx->cpu]) { + 
free(g_logctx[mctx->cpu]); + g_logctx[mctx->cpu] = NULL; + } + g_pctx[mctx->cpu] = NULL; +} +/*----------------------------------------------------------------------------*/ +mtcp_sighandler_t mtcp_register_signal(int signum, mtcp_sighandler_t handler) +{ + mtcp_sighandler_t prev; + + if (signum == SIGINT) { + prev = app_signal_handler; + app_signal_handler = handler; + } else { + if ((prev = signal(signum, handler)) == SIG_ERR) { + perror("signal"); + return SIG_ERR; + } + } + + return prev; +} +/*----------------------------------------------------------------------------*/ +int mtcp_getconf(struct mtcp_conf *conf) +{ + if (!conf) + return -1; + + conf->num_cores = CONFIG.num_cores; + conf->max_concurrency = CONFIG.max_concurrency; + + conf->max_num_buffers = CONFIG.max_num_buffers; + conf->rcvbuf_size = CONFIG.rcvbuf_size; + conf->sndbuf_size = CONFIG.sndbuf_size; + + conf->tcp_timewait = CONFIG.tcp_timewait; + conf->tcp_timeout = CONFIG.tcp_timeout; + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_setconf(const struct mtcp_conf *conf) +{ + if (!conf) + return -1; + + if (conf->num_cores > 0) + CONFIG.num_cores = conf->num_cores; + if (conf->max_concurrency > 0) + CONFIG.max_concurrency = conf->max_concurrency; + if (conf->max_num_buffers > 0) + CONFIG.max_num_buffers = conf->max_num_buffers; + if (conf->rcvbuf_size > 0) + CONFIG.rcvbuf_size = conf->rcvbuf_size; + if (conf->sndbuf_size > 0) + CONFIG.sndbuf_size = conf->sndbuf_size; + + if (conf->tcp_timewait > 0) + CONFIG.tcp_timewait = conf->tcp_timewait; + if (conf->tcp_timeout > 0) + CONFIG.tcp_timeout = conf->tcp_timeout; + + TRACE_CONFIG("Configuration updated by mtcp_setconf().\n"); + //PrintConfiguration(); + + return 0; +} + +/*----------------------------------------------------------------------------*/ +int mtcp_init(const char *config_file) +{ + int i; + int ret; + + /* getting cpu and NIC */ + /* set to max cpus only if user has not 
arbitrarily set it to lower # */ + num_cpus = (CONFIG.num_cores == 0) ? GetNumCPUs() : CONFIG.num_cores; + + assert(num_cpus >= 1); + + if (num_cpus > MAX_CPUS) { + TRACE_ERROR("You cannot run mTCP with more than %d cores due " + "to your static mTCP configuration. Please disable " + "the last %d cores in your system.\n", + MAX_CPUS, num_cpus - MAX_CPUS); + exit(EXIT_FAILURE); + } + +#if 0 + /* TODO: Enable this macro if cross-machine comm. with onvm client/server fails */ + if (num_cpus > 1) { + TRACE_ERROR("You cannot run mTCP application with more than 1 " + "core when you are using ONVM driver\n"); + exit(EXIT_FAILURE); + } +#endif + + for (i = 0; i < num_cpus; i++) { + g_mtcp[i] = NULL; + running[i] = FALSE; + sigint_cnt[i] = 0; + } + + ret = LoadConfiguration(config_file); + if (ret) { + TRACE_CONFIG("Error occured while loading configuration.\n"); + return -1; + } + PrintConfiguration(); + + for (i = 0; i < CONFIG.eths_num; i++) { + ap[i] = CreateAddressPool(CONFIG.eths[i].ip_addr, 1); + if (!ap[i]) { + TRACE_CONFIG("Error occured while create address pool[%d]\n", i); + return -1; + } + } + + PrintInterfaceInfo(); + + ret = SetRoutingTable(); + if (ret) { + TRACE_CONFIG("Error occured while loading routing table.\n"); + return -1; + } + PrintRoutingTable(); + + LoadARPTable(); + PrintARPTable(); + + if (signal(SIGUSR1, HandleSignal) == SIG_ERR) { + perror("signal, SIGUSR1"); + return -1; + } + if (signal(SIGINT, HandleSignal) == SIG_ERR) { + perror("signal, SIGINT"); + return -1; + } + app_signal_handler = NULL; + + /* load system-wide io module specs */ + current_iomodule_func->load_module(); + + return 0; +} +/*----------------------------------------------------------------------------*/ +void mtcp_destroy(void) +{ + int i; +#ifndef DISABLE_DPDK + int master = rte_get_master_lcore(); +#endif + /* wait until all threads are closed */ + for (i = 0; i < num_cpus; i++) { + if (running[i]) { +#ifndef DISABLE_DPDK + if (master != i) + rte_eal_wait_lcore(i); + 
else +#endif + { + pthread_join(g_thread[i], NULL); + } + } + } + + for (i = 0; i < CONFIG.eths_num; i++) + DestroyAddressPool(ap[i]); + +#ifndef DISABLE_DPDK + mpz_clear(CONFIG._cpumask); +#endif + +#ifdef ENABLE_ONVM + onvm_nflib_stop(CONFIG.nf_local_ctx); +#endif + + TRACE_INFO("All MTCP threads are joined.\n"); +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/cpu.c b/lib/flash/mtcp/cpu.c new file mode 100644 index 0000000..e027ebc --- /dev/null +++ b/lib/flash/mtcp/cpu.c @@ -0,0 +1,144 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include "mtcp_api.h" +#ifndef DISABLE_DPDK +#include +#include +#include +#include +#include +#include +#endif + +#define MAX_FILE_NAME 1024 + +/*----------------------------------------------------------------------------*/ +inline int GetNumCPUs(void) +{ + return sysconf(_SC_NPROCESSORS_ONLN); +} +/*----------------------------------------------------------------------------*/ +static pid_t Gettid(void) +{ + return syscall(__NR_gettid); +} +/*----------------------------------------------------------------------------*/ +inline int whichCoreID(int thread_no) +{ +#ifndef DISABLE_DPDK + int i, cpu_id; + if (mpz_get_ui(CONFIG._cpumask) == 0) + return thread_no; + else { + int limit = mpz_popcount(CONFIG._cpumask); + + for (cpu_id = 0, i = 0; i < limit; cpu_id++) + if (mpz_tstbit(CONFIG._cpumask, cpu_id)) { + if (thread_no == i) + return cpu_id; + i++; + } + } +#endif + return thread_no; +} +/*----------------------------------------------------------------------------*/ +int mtcp_core_affinitize(int cpu) +{ + cpu_set_t cpus; + size_t n; + int ret; + + n = GetNumCPUs(); + + cpu = whichCoreID(cpu); + + if (cpu < 0 || cpu >= (int)n) { + errno = -EINVAL; + return -1; + } + + CPU_ZERO(&cpus); + CPU_SET((unsigned)cpu, &cpus); + +#ifndef DISABLE_DPDK + return rte_thread_set_affinity(&cpus); +#else + struct bitmask *bmask; + FILE *fp; + char 
sysfname[MAX_FILE_NAME]; + int phy_id; + + ret = sched_setaffinity(Gettid(), sizeof(cpus), &cpus); + + if (numa_max_node() == 0) + return ret; + + bmask = numa_bitmask_alloc(numa_max_node() + 1); + assert(bmask); + + /* read physical id of the core from sys information */ + snprintf(sysfname, MAX_FILE_NAME - 1, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu); + fp = fopen(sysfname, "r"); + if (!fp) { + perror(sysfname); + errno = EFAULT; + return -1; + } + ret = fscanf(fp, "%d", &phy_id); + if (ret != 1) { + fclose(fp); + perror("Fail to read core id"); + errno = EFAULT; + return -1; + } + + numa_bitmask_setbit(bmask, phy_id); + numa_set_membind(bmask); + numa_bitmask_free(bmask); + + fclose(fp); +#endif + return ret; +} diff --git a/lib/flash/mtcp/debug.c b/lib/flash/mtcp/debug.c new file mode 100644 index 0000000..ecf83e2 --- /dev/null +++ b/lib/flash/mtcp/debug.c @@ -0,0 +1,286 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include "debug.h" +#include "tcp_in.h" +#include "logger.h" + +/*----------------------------------------------------------------------------*/ +void flush_log_data(mtcp_manager_t mtcp) +{ + int ret = 0; + if (mtcp->w_buffer) { + EnqueueJobBuffer(mtcp->logger, mtcp->w_buffer); + ret = write(mtcp->sp_fd, "A", 1); + if (ret != 1) { + TRACE_INFO("Failed to flush logs in the buffer.\n"); + perror("write() for pipe"); + } + } +} +/*----------------------------------------------------------------------------*/ +void thread_printf(mtcp_manager_t mtcp, FILE *f_idx, const char *_Format, ...) +{ + va_list argptr; + va_start(argptr, _Format); + +#define PRINT_LIMIT 4096 + int len; + log_buff *wbuf; + + assert(f_idx != NULL); + + pthread_mutex_lock(&mtcp->logger->mutex); + wbuf = mtcp->w_buffer; + if (wbuf && (wbuf->buff_len + PRINT_LIMIT > LOG_BUFF_SIZE)) { + flush_log_data(mtcp); + wbuf = NULL; + } + + if (!wbuf) { + do { // out of free buffers!! 
+ wbuf = DequeueFreeBuffer(mtcp->logger); + assert(wbuf); + } while (!wbuf); + wbuf->buff_len = 0; + wbuf->tid = mtcp->ctx->cpu; + wbuf->fid = f_idx; + mtcp->w_buffer = wbuf; + } + + len = vsnprintf(wbuf->buff + wbuf->buff_len, PRINT_LIMIT, _Format, argptr); + wbuf->buff_len += len; + pthread_mutex_unlock(&mtcp->logger->mutex); + + va_end(argptr); +} +/*----------------------------------------------------------------------------*/ +void DumpPacket(mtcp_manager_t mtcp, char *buf, int len, char *step, int ifindex) +{ + struct ethhdr *ethh; + struct iphdr *iph; + struct udphdr *udph; + struct tcphdr *tcph; + uint8_t *t; + + if (ifindex >= 0) + thread_printf(mtcp, mtcp->log_fp, "%s %d %u", step, ifindex, mtcp->cur_ts); + else + thread_printf(mtcp, mtcp->log_fp, "%s ? %u", step, mtcp->cur_ts); + + ethh = (struct ethhdr *)buf; + if (ntohs(ethh->h_proto) != ETH_P_IP) { + thread_printf(mtcp, mtcp->log_fp, "%02X:%02X:%02X:%02X:%02X:%02X -> %02X:%02X:%02X:%02X:%02X:%02X ", ethh->h_source[0], + ethh->h_source[1], ethh->h_source[2], ethh->h_source[3], ethh->h_source[4], ethh->h_source[5], + ethh->h_dest[0], ethh->h_dest[1], ethh->h_dest[2], ethh->h_dest[3], ethh->h_dest[4], ethh->h_dest[5]); + + thread_printf(mtcp, mtcp->log_fp, "protocol %04hx ", ntohs(ethh->h_proto)); + goto done; + } + + thread_printf(mtcp, mtcp->log_fp, " "); + + iph = (struct iphdr *)(ethh + 1); + udph = (struct udphdr *)((uint32_t *)iph + iph->ihl); + tcph = (struct tcphdr *)((uint32_t *)iph + iph->ihl); + + t = (uint8_t *)&iph->saddr; + thread_printf(mtcp, mtcp->log_fp, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]); + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) + thread_printf(mtcp, mtcp->log_fp, "(%d)", ntohs(udph->source)); + + thread_printf(mtcp, mtcp->log_fp, " -> "); + + t = (uint8_t *)&iph->daddr; + thread_printf(mtcp, mtcp->log_fp, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]); + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) + thread_printf(mtcp, mtcp->log_fp, "(%d)", 
ntohs(udph->dest)); + + thread_printf(mtcp, mtcp->log_fp, " IP_ID=%d", ntohs(iph->id)); + thread_printf(mtcp, mtcp->log_fp, " TTL=%d ", iph->ttl); + + if (ip_fast_csum(iph, iph->ihl)) { + __sum16 org_csum, correct_csum; + + org_csum = iph->check; + iph->check = 0; + correct_csum = ip_fast_csum(iph, iph->ihl); + thread_printf(mtcp, mtcp->log_fp, "(bad checksum %04x should be %04x) ", ntohs(org_csum), ntohs(correct_csum)); + iph->check = org_csum; + } + + switch (iph->protocol) { + case IPPROTO_TCP: + thread_printf(mtcp, mtcp->log_fp, "TCP "); + + if (tcph->syn) + thread_printf(mtcp, mtcp->log_fp, "S "); + if (tcph->fin) + thread_printf(mtcp, mtcp->log_fp, "F "); + if (tcph->ack) + thread_printf(mtcp, mtcp->log_fp, "A "); + if (tcph->rst) + thread_printf(mtcp, mtcp->log_fp, "R "); + + thread_printf(mtcp, mtcp->log_fp, "seq %u ", ntohl(tcph->seq)); + if (tcph->ack) + thread_printf(mtcp, mtcp->log_fp, "ack %u ", ntohl(tcph->ack_seq)); + thread_printf(mtcp, mtcp->log_fp, "WDW=%u ", ntohs(tcph->window)); + break; + case IPPROTO_UDP: + thread_printf(mtcp, mtcp->log_fp, "UDP "); + break; + default: + thread_printf(mtcp, mtcp->log_fp, "protocol %d ", iph->protocol); + goto done; + } +done: + thread_printf(mtcp, mtcp->log_fp, "len=%d\n", len); +} +/*----------------------------------------------------------------------------*/ +void DumpIPPacket(mtcp_manager_t mtcp, const struct iphdr *iph, int len) +{ + const struct udphdr *udph; + const struct tcphdr *tcph; + const uint8_t *t; + + udph = (const struct udphdr *)((const uint32_t *)iph + iph->ihl); + tcph = (const struct tcphdr *)((const uint32_t *)iph + iph->ihl); + + t = (const uint8_t *)&iph->saddr; + thread_printf(mtcp, mtcp->log_fp, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]); + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) + thread_printf(mtcp, mtcp->log_fp, "(%d)", ntohs(udph->source)); + + thread_printf(mtcp, mtcp->log_fp, " -> "); + + t = (const uint8_t *)&iph->daddr; + thread_printf(mtcp, 
mtcp->log_fp, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]); + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) + thread_printf(mtcp, mtcp->log_fp, "(%d)", ntohs(udph->dest)); + + thread_printf(mtcp, mtcp->log_fp, " IP_ID=%d", ntohs(iph->id)); + thread_printf(mtcp, mtcp->log_fp, " TTL=%d ", iph->ttl); + + if (ip_fast_csum(iph, iph->ihl)) { + thread_printf(mtcp, mtcp->log_fp, "(bad checksum) "); + } + + switch (iph->protocol) { + case IPPROTO_TCP: + thread_printf(mtcp, mtcp->log_fp, "TCP "); + + if (tcph->syn) + thread_printf(mtcp, mtcp->log_fp, "S "); + if (tcph->fin) + thread_printf(mtcp, mtcp->log_fp, "F "); + if (tcph->ack) + thread_printf(mtcp, mtcp->log_fp, "A "); + if (tcph->rst) + thread_printf(mtcp, mtcp->log_fp, "R "); + + thread_printf(mtcp, mtcp->log_fp, "seq %u ", ntohl(tcph->seq)); + if (tcph->ack) + thread_printf(mtcp, mtcp->log_fp, "ack %u ", ntohl(tcph->ack_seq)); + thread_printf(mtcp, mtcp->log_fp, "WDW=%u ", ntohs(tcph->window)); + break; + case IPPROTO_UDP: + thread_printf(mtcp, mtcp->log_fp, "UDP "); + break; + default: + thread_printf(mtcp, mtcp->log_fp, "protocol %d ", iph->protocol); + goto done; + } +done: + thread_printf(mtcp, mtcp->log_fp, "len=%d\n", len); +} +/*----------------------------------------------------------------------------*/ +void DumpIPPacketToFile(FILE *fout, const struct iphdr *iph, int len) +{ + const struct udphdr *udph; + const struct tcphdr *tcph; + const uint8_t *t; + + udph = (const struct udphdr *)((const uint32_t *)iph + iph->ihl); + tcph = (const struct tcphdr *)((const uint32_t *)iph + iph->ihl); + + t = (const uint8_t *)&iph->saddr; + fprintf(fout, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]); + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) + fprintf(fout, "(%d)", ntohs(udph->source)); + + fprintf(fout, " -> "); + + t = (const uint8_t *)&iph->daddr; + fprintf(fout, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]); + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) + fprintf(fout, 
"(%d)", ntohs(udph->dest)); + + fprintf(fout, " IP_ID=%d", ntohs(iph->id)); + fprintf(fout, " TTL=%d ", iph->ttl); + + if (ip_fast_csum(iph, iph->ihl)) { + fprintf(fout, "(bad checksum) "); + } + + switch (iph->protocol) { + case IPPROTO_TCP: + fprintf(fout, "TCP "); + + if (tcph->syn) + fprintf(fout, "S "); + if (tcph->fin) + fprintf(fout, "F "); + if (tcph->ack) + fprintf(fout, "A "); + if (tcph->rst) + fprintf(fout, "R "); + + fprintf(fout, "seq %u ", ntohl(tcph->seq)); + if (tcph->ack) + fprintf(fout, "ack %u ", ntohl(tcph->ack_seq)); + fprintf(fout, "WDW=%u ", ntohs(tcph->window)); + break; + case IPPROTO_UDP: + fprintf(fout, "UDP "); + break; + default: + fprintf(fout, "protocol %d ", iph->protocol); + goto done; + } +done: + fprintf(fout, "len=%d\n", len); +} diff --git a/lib/flash/mtcp/dpdk_module.c b/lib/flash/mtcp/dpdk_module.c new file mode 100644 index 0000000..3b9ad09 --- /dev/null +++ b/lib/flash/mtcp/dpdk_module.c @@ -0,0 +1,942 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* for io_module_func def'ns */ +#include "io_module.h" +#ifndef DISABLE_DPDK +/* for mtcp related def'ns */ +#include "mtcp.h" +/* for errno */ +#include +/* for logging */ +#include "debug.h" +/* for num_devices_* */ +#include "config.h" +/* for rte_max_eth_ports */ +#include +/* for rte_eth_rxconf */ +#include +/* for delay funcs */ +#include +#include +#define ENABLE_STATS_IOCTL 1 +#ifdef ENABLE_STATS_IOCTL +/* for close */ +#include +/* for open */ +#include +/* for ioctl */ +#include +#endif /* !ENABLE_STATS_IOCTL */ +/* for ip pseudo-chksum */ +#include +//#define IP_DEFRAG 1 +#ifdef IP_DEFRAG +/* for ip defragging */ +#include +#endif +/* for ioctl funcs */ +#include +/* for retrieving rte version(s) */ +#include +/*----------------------------------------------------------------------------*/ +/* Essential macros */ +#define MAX_RX_QUEUE_PER_LCORE MAX_CPUS +#define MAX_TX_QUEUE_PER_PORT MAX_CPUS + +#ifdef ENABLELRO +#define BUF_SIZE 16384 +#else +#define BUF_SIZE 2048 +#endif /* !ENABLELRO */ +#define MBUF_SIZE (BUF_SIZE + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) +#define NB_MBUF 8192 +#define MEMPOOL_CACHE_SIZE 256 +#ifdef ENFORCE_RX_IDLE +#define RX_IDLE_ENABLE 1 +#define RX_IDLE_TIMEOUT 1 
/* in micro-seconds */ +#endif + +/* + * RX and TX Prefetch, Host, and Write-back threshold values should be + * carefully set for optimal performance. Consult the network + * controller's datasheet and supporting DPDK documentation for guidance + * on how these parameters should be set. + */ +#define RX_PTHRESH 8 /**< Default values of RX prefetch threshold reg. */ +#define RX_HTHRESH 8 /**< Default values of RX host threshold reg. */ +#define RX_WTHRESH 4 /**< Default values of RX write-back threshold reg. */ + +/* + * These default values are optimized for use with the Intel(R) 82599 10 GbE + * Controller and the DPDK ixgbe PMD. Consider using other values for other + * network controllers and/or network drivers. + */ +#define TX_PTHRESH 36 /**< Default values of TX prefetch threshold reg. */ +#define TX_HTHRESH 0 /**< Default values of TX host threshold reg. */ +#define TX_WTHRESH 0 /**< Default values of TX write-back threshold reg. */ + +#define MAX_PKT_BURST 64 /*128*/ + +/* + * Configurable number of RX/TX ring descriptors + */ +#define RTE_TEST_RX_DESC_DEFAULT 128 +#define RTE_TEST_TX_DESC_DEFAULT 128 + +/* + * Ethernet frame overhead + */ + +#define ETHER_IFG 12 +#define ETHER_PREAMBLE 8 +#if RTE_VERSION < RTE_VERSION_NUM(19, 8, 0, 0) +#define ETHER_OVR (ETHER_CRC_LEN + ETHER_PREAMBLE + ETHER_IFG) +#else +#define ETHER_OVR (RTE_ETHER_CRC_LEN + ETHER_PREAMBLE + ETHER_IFG) +#endif + +static const uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT; +static const uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT; +/*----------------------------------------------------------------------------*/ +/* packet memory pools for storing packet bufs */ +static struct rte_mempool *pktmbuf_pool[MAX_CPUS] = { NULL }; + +//#define DEBUG 1 +#ifdef DEBUG +/* ethernet addresses of ports */ +static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; +#endif + +static struct rte_eth_dev_info dev_info[RTE_MAX_ETHPORTS]; + +static struct rte_eth_conf port_conf = { + .rxmode = { + .mq_mode = 
ETH_MQ_RX_RSS, +#if RTE_VERSION < RTE_VERSION_NUM(19, 8, 0, 0) + .max_rx_pkt_len = ETHER_MAX_LEN, +#else + .max_rx_pkt_len = RTE_ETHER_MAX_LEN, +#endif +#if RTE_VERSION > RTE_VERSION_NUM(17, 8, 0, 0) + .offloads = ( +#if RTE_VERSION < RTE_VERSION_NUM(18, 5, 0, 0) + DEV_RX_OFFLOAD_CRC_STRIP | +#endif /* !18.05 */ + DEV_RX_OFFLOAD_CHECKSUM +#ifdef ENABLELRO + | DEV_RX_OFFLOAD_TCP_LRO +#endif + ), +#endif /* !17.08 */ + .split_hdr_size = 0, +#if RTE_VERSION < RTE_VERSION_NUM(18, 5, 0, 0) + .header_split = 0, /**< Header Split disabled */ + .hw_ip_checksum = 1, /**< IP checksum offload enabled */ + .hw_vlan_filter = 0, /**< VLAN filtering disabled */ + .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ + .hw_strip_crc = 1, /**< CRC stripped by hardware */ +#endif /* !18.05 */ +#ifdef ENABLELRO + .enable_lro = 1, /**< Enable LRO */ +#endif + }, + .rx_adv_conf = { + .rss_conf = { + .rss_key = NULL, + .rss_hf = ETH_RSS_TCP | ETH_RSS_UDP | + ETH_RSS_IP | ETH_RSS_L2_PAYLOAD + }, + }, + .txmode = { + .mq_mode = ETH_MQ_TX_NONE, +#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0) + .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM) +#endif + }, +}; + +static const struct rte_eth_rxconf rx_conf = { + .rx_thresh = { + .pthresh = RX_PTHRESH, /* RX prefetch threshold reg */ + .hthresh = RX_HTHRESH, /* RX host threshold reg */ + .wthresh = RX_WTHRESH, /* RX write-back threshold reg */ + }, + .rx_free_thresh = 32, +}; + +static const struct rte_eth_txconf tx_conf = { + .tx_thresh = { + .pthresh = TX_PTHRESH, /* TX prefetch threshold reg */ + .hthresh = TX_HTHRESH, /* TX host threshold reg */ + .wthresh = TX_WTHRESH, /* TX write-back threshold reg */ + }, + .tx_free_thresh = 0, /* Use PMD default values */ + .tx_rs_thresh = 0, /* Use PMD default values */ +#if RTE_VERSION < RTE_VERSION_NUM(18, 5, 0, 0) + /* + * As the example won't handle mult-segments and offload cases, + * set the flag by default. 
+ */ + .txq_flags = 0x0, +#endif +}; + +struct mbuf_table { + uint16_t len; /* length of queued packets */ + struct rte_mbuf *m_table[MAX_PKT_BURST]; +}; + +struct dpdk_private_context { + struct mbuf_table rmbufs[RTE_MAX_ETHPORTS]; + struct mbuf_table wmbufs[RTE_MAX_ETHPORTS]; + struct rte_mempool *pktmbuf_pool; + struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; +#ifdef RX_IDLE_ENABLE + uint8_t rx_idle; +#endif +#ifdef IP_DEFRAG + struct rte_ip_frag_tbl *frag_tbl; + struct rte_ip_frag_death_row death_row; +#endif +#ifdef ENABLELRO + struct rte_mbuf *cur_rx_m; +#endif +#ifdef ENABLE_STATS_IOCTL + int fd; + uint32_t cur_ts; +#endif /* !ENABLE_STATS_IOCTL */ +} __rte_cache_aligned; + +#ifdef ENABLE_STATS_IOCTL +/** + * stats struct passed on from user space to the driver + */ +struct stats_struct { + uint64_t tx_bytes; + uint64_t tx_pkts; + uint64_t rx_bytes; + uint64_t rx_pkts; + uint64_t rmiss; + uint64_t rerr; + uint64_t terr; + uint8_t qid; + uint8_t dev; +}; +#endif /* !ENABLE_STATS_IOCTL */ + +#ifdef IP_DEFRAG +/* Should be power of two. 
*/ +#define IP_FRAG_TBL_BUCKET_ENTRIES 16 +#define RTE_LOGTYPE_IP_RSMBL RTE_LOGTYPE_USER1 +#define MAX_FRAG_NUM RTE_LIBRTE_IP_FRAG_MAX_FRAG +#endif /* !IP_DEFRAG */ +/*----------------------------------------------------------------------------*/ +void dpdk_init_handle(struct mtcp_thread_context *ctxt) +{ + struct dpdk_private_context *dpc; + int i, j; + char mempool_name[RTE_MEMPOOL_NAMESIZE]; + + /* create and initialize private I/O module context */ + ctxt->io_private_context = calloc(1, sizeof(struct dpdk_private_context)); + if (ctxt->io_private_context == NULL) { + TRACE_ERROR("Failed to initialize ctxt->io_private_context: " + "Can't allocate memory\n"); + exit(EXIT_FAILURE); + } + + sprintf(mempool_name, "mbuf_pool-%d", ctxt->cpu); + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + dpc->pktmbuf_pool = pktmbuf_pool[ctxt->cpu]; + + /* set wmbufs correctly */ + for (j = 0; j < num_devices_attached; j++) { + /* Allocate wmbufs for each registered port */ + for (i = 0; i < MAX_PKT_BURST; i++) { + dpc->wmbufs[j].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]); + if (dpc->wmbufs[j].m_table[i] == NULL) { + TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n", ctxt->cpu, i, j); + exit(EXIT_FAILURE); + } + } + /* set mbufs queue length to 0 to begin with */ + dpc->wmbufs[j].len = 0; + } + +#ifdef IP_DEFRAG + int max_flows; + int socket; + uint64_t frag_cycles; + + max_flows = CONFIG.max_concurrency / CONFIG.num_cores; + frag_cycles = (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S * max_flows; + socket = rte_lcore_to_socket_id(ctxt->cpu); + + if ((dpc->frag_tbl = rte_ip_frag_table_create(max_flows, IP_FRAG_TBL_BUCKET_ENTRIES, max_flows, frag_cycles, socket)) == + NULL) { + RTE_LOG(ERR, IP_RSMBL, + "ip_frag_tbl_create(%u) on " + "lcore: %u for queue: %u failed\n", + max_flows, ctxt->cpu, ctxt->cpu); + exit(EXIT_FAILURE); + } +#endif /* !IP_DEFRAG */ + +#ifdef ENABLE_STATS_IOCTL + dpc->fd = open(DEV_PATH, O_RDWR); + if (dpc->fd == -1) { + 
TRACE_ERROR("Can't open " DEV_PATH " for context->cpu: %d! " + "Are you using mlx4/mlx5 driver?\n", + ctxt->cpu); + } +#endif /* !ENABLE_STATS_IOCTL */ +} +/*----------------------------------------------------------------------------*/ +int dpdk_link_devices(struct mtcp_thread_context *ctxt) +{ + /* linking takes place during mtcp_init() */ + + return 0; +} +/*----------------------------------------------------------------------------*/ +void dpdk_release_pkt(struct mtcp_thread_context *ctxt, int ifidx, unsigned char *pkt_data, int len) +{ + /* + * do nothing over here - memory reclamation + * will take place in dpdk_recv_pkts + */ +} +/*----------------------------------------------------------------------------*/ +int dpdk_send_pkts(struct mtcp_thread_context *ctxt, int ifidx) +{ + struct dpdk_private_context *dpc; +#ifdef NETSTAT + mtcp_manager_t mtcp; +#endif + int ret, i, portid = CONFIG.eths[ifidx].ifindex; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; +#ifdef NETSTAT + mtcp = ctxt->mtcp_manager; +#endif + ret = 0; + + /* if there are packets in the queue... 
flush them out to the wire */ + if (dpc->wmbufs[ifidx].len > /*= MAX_PKT_BURST*/ 0) { + struct rte_mbuf **pkts; +#ifdef ENABLE_STATS_IOCTL +#ifdef NETSTAT + struct rte_eth_stats stats; + struct stats_struct ss; +#endif +#endif /* !ENABLE_STATS_IOCTL */ + int cnt = dpc->wmbufs[ifidx].len; + pkts = dpc->wmbufs[ifidx].m_table; +#ifdef NETSTAT + mtcp->nstat.tx_packets[ifidx] += cnt; +#ifdef ENABLE_STATS_IOCTL + /* only pass stats after >= 1 sec interval */ + if (abs(mtcp->cur_ts - dpc->cur_ts) >= 1000 && likely(dpc->fd >= 0)) { + /* rte_get_stats is global func, use only for 1 core */ + if (ctxt->cpu == 0) { + rte_eth_stats_get(portid, &stats); + ss.rmiss = stats.imissed; + ss.rerr = stats.ierrors; + ss.terr = stats.oerrors; + } else + ss.rmiss = ss.rerr = ss.terr = 0; + + ss.tx_pkts = mtcp->nstat.tx_packets[ifidx]; + ss.tx_bytes = mtcp->nstat.tx_bytes[ifidx]; + ss.rx_pkts = mtcp->nstat.rx_packets[ifidx]; + ss.rx_bytes = mtcp->nstat.rx_bytes[ifidx]; + ss.qid = ctxt->cpu; + ss.dev = portid; + /* pass the info now */ + if (ioctl(dpc->fd, SEND_STATS, &ss) == -1) + TRACE_ERROR("Can't update iface stats!\n"); + dpc->cur_ts = mtcp->cur_ts; + if (ctxt->cpu == 0) + rte_eth_stats_reset(portid); + } +#endif /* !ENABLE_STATS_IOCTL */ +#endif + do { + /* tx cnt # of packets */ + ret = rte_eth_tx_burst(portid, ctxt->cpu, pkts, cnt); + pkts += ret; + cnt -= ret; + /* if not all pkts were sent... 
then repeat the cycle */ + } while (cnt > 0); + + /* time to allocate fresh mbufs for the queue */ + for (i = 0; i < dpc->wmbufs[ifidx].len; i++) { + dpc->wmbufs[ifidx].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]); + /* error checking */ + if (unlikely(dpc->wmbufs[ifidx].m_table[i] == NULL)) { + TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n", ctxt->cpu, i, ifidx); + exit(EXIT_FAILURE); + } + } + /* reset the len of mbufs var after flushing of packets */ + dpc->wmbufs[ifidx].len = 0; + } + + return ret; +} +/*----------------------------------------------------------------------------*/ +uint8_t *dpdk_get_wptr(struct mtcp_thread_context *ctxt, int ifidx, uint16_t pktsize) +{ + struct dpdk_private_context *dpc; +#ifdef NETSTAT + mtcp_manager_t mtcp; +#endif + struct rte_mbuf *m; + uint8_t *ptr; + int len_of_mbuf; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; +#ifdef NETSTAT + mtcp = ctxt->mtcp_manager; +#endif + + /* sanity check */ + if (unlikely(dpc->wmbufs[ifidx].len == MAX_PKT_BURST)) + return NULL; + + len_of_mbuf = dpc->wmbufs[ifidx].len; + m = dpc->wmbufs[ifidx].m_table[len_of_mbuf]; + + /* retrieve the right write offset */ + ptr = (void *)rte_pktmbuf_mtod(m, struct ether_hdr *); + m->pkt_len = m->data_len = pktsize; + m->nb_segs = 1; + m->next = NULL; + +#ifdef NETSTAT + mtcp->nstat.tx_bytes[ifidx] += pktsize + ETHER_OVR; +#endif + + /* increment the len_of_mbuf var */ + dpc->wmbufs[ifidx].len = len_of_mbuf + 1; + + return (uint8_t *)ptr; +} +/*----------------------------------------------------------------------------*/ +static inline void free_pkts(struct rte_mbuf **mtable, unsigned len) +{ + int i; + + /* free the freaking packets */ + for (i = 0; i < len; i++) { + rte_pktmbuf_free(mtable[i]); + RTE_MBUF_PREFETCH_TO_FREE(mtable[i + 1]); + } +} +/*----------------------------------------------------------------------------*/ +int32_t dpdk_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx) +{ + struct 
dpdk_private_context *dpc; + int ret; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + + if (dpc->rmbufs[ifidx].len != 0) { + free_pkts(dpc->rmbufs[ifidx].m_table, dpc->rmbufs[ifidx].len); + dpc->rmbufs[ifidx].len = 0; + } + + int portid = CONFIG.eths[ifidx].ifindex; + ret = rte_eth_rx_burst((uint8_t)portid, ctxt->cpu, dpc->pkts_burst, MAX_PKT_BURST); +#ifdef RX_IDLE_ENABLE + dpc->rx_idle = (likely(ret != 0)) ? 0 : dpc->rx_idle + 1; +#endif + dpc->rmbufs[ifidx].len = ret; + + return ret; +} +/*----------------------------------------------------------------------------*/ +#ifdef IP_DEFRAG +struct rte_mbuf *ip_reassemble(struct dpdk_private_context *dpc, struct rte_mbuf *m) +{ + struct ether_hdr *eth_hdr; + struct rte_ip_frag_tbl *tbl; + struct rte_ip_frag_death_row *dr; + + /* if packet is IPv4 */ + if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) { + struct ipv4_hdr *ip_hdr; + + eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + ip_hdr = (struct ipv4_hdr *)(eth_hdr + 1); + + /* if it is a fragmented packet, then try to reassemble. */ + if (rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) { + struct rte_mbuf *mo; + + tbl = dpc->frag_tbl; + dr = &dpc->death_row; + + /* prepare mbuf: setup l2_len/l3_len. */ + m->l2_len = sizeof(*eth_hdr); + m->l3_len = sizeof(*ip_hdr); + + /* process this fragment. */ + mo = rte_ipv4_frag_reassemble_packet(tbl, dr, m, rte_rdtsc(), ip_hdr); + if (mo == NULL) + /* no packet to send out. */ + return NULL; + + /* we have our packet reassembled. */ + if (mo != m) + m = mo; + } + } + + /* if packet isn't IPv4, just accept it! 
*/ + return m; +} +#endif +/*----------------------------------------------------------------------------*/ +uint8_t *dpdk_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len) +{ + struct dpdk_private_context *dpc; + struct rte_mbuf *m; + uint8_t *pktbuf; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + + m = dpc->pkts_burst[index]; +#ifdef IP_DEFRAG + m = ip_reassemble(dpc, m); +#endif + *len = m->pkt_len; + pktbuf = rte_pktmbuf_mtod(m, uint8_t *); + + /* enqueue the pkt ptr in mbuf */ + dpc->rmbufs[ifidx].m_table[index] = m; + + /* verify checksum values from ol_flags */ + if ((m->ol_flags & (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD)) != 0) { + TRACE_ERROR("%s(%p, %d, %d): mbuf with invalid checksum: " + "%p(%lu);\n", + __func__, ctxt, ifidx, index, m, m->ol_flags); + pktbuf = NULL; + } +#ifdef ENABLELRO + dpc->cur_rx_m = m; +#endif /* ENABLELRO */ + + return pktbuf; +} +/*----------------------------------------------------------------------------*/ +int32_t dpdk_select(struct mtcp_thread_context *ctxt) +{ +#ifdef RX_IDLE_ENABLE + struct dpdk_private_context *dpc; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + if (dpc->rx_idle > RX_IDLE_THRESH) { + dpc->rx_idle = 0; + usleep(RX_IDLE_TIMEOUT); + } +#endif + return 0; +} +/*----------------------------------------------------------------------------*/ +void dpdk_destroy_handle(struct mtcp_thread_context *ctxt) +{ + struct dpdk_private_context *dpc; + int i; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + + /* free wmbufs */ + for (i = 0; i < num_devices_attached; i++) + free_pkts(dpc->wmbufs[i].m_table, MAX_PKT_BURST); + +#ifdef ENABLE_STATS_IOCTL + /* free fd */ + if (dpc->fd >= 0) + close(dpc->fd); +#endif /* !ENABLE_STATS_IOCTL */ + + /* free it all up */ + free(dpc); +} +/*----------------------------------------------------------------------------*/ +static void check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask) +{ +#define CHECK_INTERVAL 100 /* 100ms */ +#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ + + uint8_t portid, count, all_ports_up, print_flag = 0; + struct rte_eth_link link; + + printf("\nChecking link status"); + fflush(stdout); + for (count = 0; count <= MAX_CHECK_TIME; count++) { + all_ports_up = 1; + for (portid = 0; portid < port_num; portid++) { + if ((port_mask & (1 << portid)) == 0) + continue; + memset(&link, 0, sizeof(link)); + rte_eth_link_get_nowait(portid, &link); + /* print link status if flag set */ + if (print_flag == 1) { + if (link.link_status) + printf("Port %d Link Up - speed %u " + "Mbps - %s\n", + (uint8_t)portid, (unsigned)link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex\n")); + else + printf("Port %d Link Down\n", (uint8_t)portid); + continue; + } + /* clear all_ports_up flag if any link down */ + if (link.link_status == 0) { + all_ports_up = 0; + break; + } + } + /* after finally printing all link status, get out */ + if (print_flag == 1) + break; + + if (all_ports_up == 0) { + printf("."); + fflush(stdout); + rte_delay_ms(CHECK_INTERVAL); + } + + /* set the print_flag if all ports up or timeout */ + if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { + print_flag = 1; + printf("done\n"); + } + } +} +/*----------------------------------------------------------------------------*/ +void dpdk_load_module(void) +{ + int portid, rxlcore_id, ret; + /* for Ethernet flow control settings */ + struct rte_eth_fc_conf fc_conf; + /* setting the rss key */ + static uint8_t key[] = { + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 10 */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 20 */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 30 */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 40 */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 50 */ + 0x05, 0x05 /* 60 - 8 */ + }; + + 
port_conf.rx_adv_conf.rss_conf.rss_key = (uint8_t *)key; + port_conf.rx_adv_conf.rss_conf.rss_key_len = sizeof(key); + + if (!CONFIG.multi_process || (CONFIG.multi_process && CONFIG.multi_process_is_master)) { + for (rxlcore_id = 0; rxlcore_id < CONFIG.num_cores; rxlcore_id++) { + char name[RTE_MEMPOOL_NAMESIZE]; + uint32_t nb_mbuf; + sprintf(name, "mbuf_pool-%d", rxlcore_id); + nb_mbuf = NB_MBUF; +#ifdef IP_DEFRAG + int max_flows; + max_flows = CONFIG.max_concurrency / CONFIG.num_cores; + + /* + * At any given moment up to + * mbufs could be stored int the fragment table. + * Plus, each TX queue can hold up to packets. + */ + + nb_mbuf = RTE_MAX(max_flows, 2UL * MAX_PKT_BURST) * MAX_FRAG_NUM; + nb_mbuf *= (port_conf.rxmode.max_rx_pkt_len + BUF_SIZE - 1) / BUF_SIZE; + nb_mbuf += RTE_TEST_RX_DESC_DEFAULT + RTE_TEST_TX_DESC_DEFAULT; + + nb_mbuf = RTE_MAX(nb_mbuf, (uint32_t)NB_MBUF); +#endif + /* create the mbuf pools */ + pktmbuf_pool[rxlcore_id] = rte_mempool_create(name, nb_mbuf, MBUF_SIZE, MEMPOOL_CACHE_SIZE, + sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, + NULL, rte_pktmbuf_init, NULL, rte_socket_id(), + MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET); + + if (pktmbuf_pool[rxlcore_id] == NULL) + rte_exit(EXIT_FAILURE, "Cannot init mbuf pool, errno: %d\n", rte_errno); + } + + /* Initialise each port */ + int i; + for (i = 0; i < num_devices_attached; ++i) { + /* get portid form the index of attached devices */ + portid = devices_attached[i]; + + /* check port capabilities */ + rte_eth_dev_info_get(portid, &dev_info[portid]); +#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0) + /* re-adjust rss_hf */ + port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info[portid].flow_type_rss_offloads; +#endif + /* init port */ + printf("Initializing port %u... 
", (unsigned)portid); + fflush(stdout); + if (!strncmp(dev_info[portid].driver_name, "net_mlx", 7)) + port_conf.rx_adv_conf.rss_conf.rss_key_len = 40; + + ret = rte_eth_dev_configure(portid, CONFIG.num_cores, CONFIG.num_cores, &port_conf); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Cannot configure device: err=%d, port=%u, cores: %d\n", ret, (unsigned)portid, + CONFIG.num_cores); + + /* init one RX queue per CPU */ + fflush(stdout); +#ifdef DEBUG + rte_eth_macaddr_get(portid, &ports_eth_addr[portid]); +#endif + + for (rxlcore_id = 0; rxlcore_id < CONFIG.num_cores; rxlcore_id++) { + ret = rte_eth_rx_queue_setup(portid, rxlcore_id, nb_rxd, rte_eth_dev_socket_id(portid), &rx_conf, + pktmbuf_pool[rxlcore_id]); + if (ret < 0) + rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup:err=%d, port=%u, queueid: %d\n", ret, + (unsigned)portid, rxlcore_id); + } + + /* init one TX queue on each port per CPU (this is redundant for this app) */ + fflush(stdout); + for (rxlcore_id = 0; rxlcore_id < CONFIG.num_cores; rxlcore_id++) { + ret = rte_eth_tx_queue_setup(portid, rxlcore_id, nb_txd, rte_eth_dev_socket_id(portid), &tx_conf); + if (ret < 0) + rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup:err=%d, port=%u, queueid: %d\n", ret, + (unsigned)portid, rxlcore_id); + } + + /* Start device */ + ret = rte_eth_dev_start(portid); + if (ret < 0) + rte_exit(EXIT_FAILURE, "rte_eth_dev_start:err=%d, port=%u\n", ret, (unsigned)portid); + + printf("done: \n"); + rte_eth_promiscuous_enable(portid); + + /* retrieve current flow control settings per port */ + memset(&fc_conf, 0, sizeof(fc_conf)); + ret = rte_eth_dev_flow_ctrl_get(portid, &fc_conf); + if (ret != 0) + TRACE_INFO("Failed to get flow control info!\n"); + + /* and just disable the rx/tx flow control */ + fc_conf.mode = RTE_FC_NONE; + ret = rte_eth_dev_flow_ctrl_set(portid, &fc_conf); + if (ret != 0) + TRACE_INFO("Failed to set flow control info!: errno: %d\n", ret); + +#ifdef DEBUG + printf("Port %u, MAC address: 
%02X:%02X:%02X:%02X:%02X:%02X\n\n", (unsigned)portid, + ports_eth_addr[portid].addr_bytes[0], ports_eth_addr[portid].addr_bytes[1], + ports_eth_addr[portid].addr_bytes[2], ports_eth_addr[portid].addr_bytes[3], + ports_eth_addr[portid].addr_bytes[4], ports_eth_addr[portid].addr_bytes[5]); +#endif + } + /* only check for link status if the thread is master */ + check_all_ports_link_status(num_devices_attached, 0xFFFFFFFF); + } else { /* CONFIG.multi_process && !CONFIG.multi_process_is_master */ + for (rxlcore_id = 0; rxlcore_id < CONFIG.num_cores; rxlcore_id++) { + char name[RTE_MEMPOOL_NAMESIZE]; + sprintf(name, "mbuf_pool-%d", rxlcore_id); + /* initialize the mbuf pools */ + pktmbuf_pool[rxlcore_id] = rte_mempool_lookup(name); + if (pktmbuf_pool[rxlcore_id] == NULL) + rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n"); + } + + int i; + /* initializing dev_info struct */ + for (i = 0; i < num_devices_attached; i++) { + /* get portid form the index of attached devices */ + portid = devices_attached[i]; + /* check port capabilities */ + rte_eth_dev_info_get(i, &dev_info[portid]); + } + } +} +/*----------------------------------------------------------------------------*/ +int32_t dpdk_dev_ioctl(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp) +{ + struct dpdk_private_context *dpc; + struct rte_mbuf *m; + int len_of_mbuf; + struct iphdr *iph; + struct tcphdr *tcph; + void **argpptr = (void **)argp; +#ifdef ENABLELRO + uint8_t *payload, *to; + int seg_off; +#endif + + if (cmd == DRV_NAME) { + *argpptr = (void *)dev_info[nif].driver_name; + return 0; + } + + int eidx = CONFIG.nif_to_eidx[nif]; + + iph = (struct iphdr *)argp; + dpc = (struct dpdk_private_context *)ctx->io_private_context; + len_of_mbuf = dpc->wmbufs[eidx].len; + + switch (cmd) { + case PKT_TX_IP_CSUM: + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0) + goto dev_ioctl_err; + m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1]; + m->ol_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4; 
+#if RTE_VERSION < RTE_VERSION_NUM(19, 8, 0, 0) + m->l2_len = sizeof(struct ether_hdr); +#else + m->l2_len = sizeof(struct rte_ether_hdr); +#endif + m->l3_len = (iph->ihl << 2); + break; + case PKT_TX_TCP_CSUM: + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0) + goto dev_ioctl_err; + m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1]; + tcph = (struct tcphdr *)((unsigned char *)iph + (iph->ihl << 2)); + m->ol_flags |= PKT_TX_TCP_CKSUM; +#if RTE_VERSION < RTE_VERSION_NUM(19, 8, 0, 0) + tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags); +#else + tcph->check = rte_ipv4_phdr_cksum((struct rte_ipv4_hdr *)iph, m->ol_flags); +#endif + break; +#ifdef ENABLELRO + case PKT_RX_TCP_LROSEG: + m = dpc->cur_rx_m; + //if (m->next != NULL) + // rte_prefetch0(rte_pktmbuf_mtod(m->next, void *)); + iph = rte_pktmbuf_mtod_offset(m, struct iphdr *, sizeof(struct ether_hdr)); + tcph = (struct tcphdr *)((u_char *)iph + (iph->ihl << 2)); + payload = (uint8_t *)tcph + (tcph->doff << 2); + + seg_off = m->data_len - sizeof(struct ether_hdr) - (iph->ihl << 2) - (tcph->doff << 2); + + to = (uint8_t *)argp; + m = m->next; + memcpy(to, payload, seg_off); + while (m != NULL) { + //if (m->next != NULL) + // rte_prefetch0(rte_pktmbuf_mtod(m->next, void *)); + memcpy(to + seg_off, rte_pktmbuf_mtod(m, uint8_t *), m->data_len); + seg_off += m->data_len; + m = m->next; + } + break; +#endif + case PKT_TX_TCPIP_CSUM: + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0) + goto dev_ioctl_err; + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0) + goto dev_ioctl_err; + m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1]; +#if RTE_VERSION < RTE_VERSION_NUM(19, 8, 0, 0) + iph = rte_pktmbuf_mtod_offset(m, struct iphdr *, sizeof(struct ether_hdr)); +#else + iph = rte_pktmbuf_mtod_offset(m, struct iphdr *, sizeof(struct rte_ether_hdr)); +#endif + tcph = (struct tcphdr *)((uint8_t *)iph + (iph->ihl << 2)); +#if RTE_VERSION < RTE_VERSION_NUM(19, 
8, 0, 0) + m->l2_len = sizeof(struct ether_hdr); +#else + m->l2_len = sizeof(struct rte_ether_hdr); +#endif + m->l3_len = (iph->ihl << 2); + m->l4_len = (tcph->doff << 2); + m->ol_flags = PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4; +#if RTE_VERSION < RTE_VERSION_NUM(19, 8, 0, 0) + tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags); +#else + tcph->check = rte_ipv4_phdr_cksum((struct rte_ipv4_hdr *)iph, m->ol_flags); +#endif + break; + case PKT_RX_IP_CSUM: + if ((dev_info[nif].rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) == 0) + goto dev_ioctl_err; + break; + case PKT_RX_TCP_CSUM: + if ((dev_info[nif].rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM) == 0) + goto dev_ioctl_err; + break; + case PKT_TX_TCPIP_CSUM_PEEK: + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0) + goto dev_ioctl_err; + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0) + goto dev_ioctl_err; + break; + default: + goto dev_ioctl_err; + } + return 0; +dev_ioctl_err: + return -1; +} +/*----------------------------------------------------------------------------*/ +io_module_func dpdk_module_func = { .load_module = dpdk_load_module, + .init_handle = dpdk_init_handle, + .link_devices = dpdk_link_devices, + .release_pkt = dpdk_release_pkt, + .send_pkts = dpdk_send_pkts, + .get_wptr = dpdk_get_wptr, + .recv_pkts = dpdk_recv_pkts, + .get_rptr = dpdk_get_rptr, + .select = dpdk_select, + .destroy_handle = dpdk_destroy_handle, + .dev_ioctl = dpdk_dev_ioctl }; +/*----------------------------------------------------------------------------*/ +#else +io_module_func dpdk_module_func = { .load_module = NULL, + .init_handle = NULL, + .link_devices = NULL, + .release_pkt = NULL, + .send_pkts = NULL, + .get_wptr = NULL, + .recv_pkts = NULL, + .get_rptr = NULL, + .select = NULL, + .destroy_handle = NULL, + .dev_ioctl = NULL }; +/*----------------------------------------------------------------------------*/ +#endif /* !DISABLE_DPDK */ diff --git 
a/lib/flash/mtcp/eth_in.c b/lib/flash/mtcp/eth_in.c new file mode 100644 index 0000000..2c4c028 --- /dev/null +++ b/lib/flash/mtcp/eth_in.c @@ -0,0 +1,84 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "ps.h" +#include "ip_in.h" +#include "eth_in.h" +#include "arp.h" +#include "debug.h" + +/*----------------------------------------------------------------------------*/ +int ProcessPacket(mtcp_manager_t mtcp, const int ifidx, uint32_t cur_ts, unsigned char *pkt_data, int len) +{ + struct ethhdr *ethh = (struct ethhdr *)pkt_data; + u_short ip_proto = ntohs(ethh->h_proto); + int ret; + +#ifdef PKTDUMP + DumpPacket(mtcp, (char *)pkt_data, len, "IN", ifidx); +#endif + +#ifdef NETSTAT + mtcp->nstat.rx_packets[ifidx]++; + mtcp->nstat.rx_bytes[ifidx] += len + 24; +#endif /* NETSTAT */ + +#if 0 + /* ignore mac address which is not for current interface */ + int i; + for (i = 0; i < 6; i ++) { + if (ethh->h_dest[i] != CONFIG.eths[ifidx].haddr[i]) { + return FALSE; + } + } +#endif + + if (ip_proto == ETH_P_IP) { + /* process ipv4 packet */ + ret = ProcessIPv4Packet(mtcp, cur_ts, ifidx, pkt_data, len); + + } else if (ip_proto == ETH_P_ARP) { + ProcessARPPacket(mtcp, cur_ts, ifidx, pkt_data, len); + return TRUE; + + } else { + //DumpPacket(mtcp, (char *)pkt_data, len, "??", ifidx); + return FALSE; + } + +#ifdef NETSTAT + if (ret < 0) { + mtcp->nstat.rx_errors[ifidx]++; + } +#endif + + return ret; +} diff --git a/lib/flash/mtcp/eth_out.c b/lib/flash/mtcp/eth_out.c new file mode 100644 index 0000000..196450a --- /dev/null +++ b/lib/flash/mtcp/eth_out.c @@ -0,0 +1,110 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "mtcp.h" +#include "arp.h" +#include "eth_out.h" +#include "debug.h" + +#ifndef TRUE +#define TRUE (1) +#endif + +#ifndef FALSE +#define FALSE (0) +#endif + +#ifndef ERROR +#define ERROR (-1) +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +#define MAX_WINDOW_SIZE 65535 + +/*----------------------------------------------------------------------------*/ +uint8_t *EthernetOutput(struct mtcp_manager *mtcp, uint16_t h_proto, int nif, unsigned char *dst_haddr, uint16_t iplen) +{ + uint8_t *buf; + struct ethhdr *ethh; + int i, eidx; + + /* + * -sanity check- + * return early if no interface is set (if routing entry does not exist) + */ + if (nif < 0) { + TRACE_INFO("No interface set!\n"); + return NULL; + } + + eidx = CONFIG.nif_to_eidx[nif]; + if (eidx < 0) { + TRACE_INFO("No interface selected!\n"); + return NULL; + } + + buf = mtcp->iom->get_wptr(mtcp->ctx, eidx, iplen + ETHERNET_HEADER_LEN); + if (!buf) { + //TRACE_DBG("Failed to get available write buffer\n"); + return NULL; + } + //memset(buf, 0, ETHERNET_HEADER_LEN + iplen); + +#if 0 + TRACE_DBG("dst_hwaddr: %02X:%02X:%02X:%02X:%02X:%02X\n", + dst_haddr[0], dst_haddr[1], + dst_haddr[2], dst_haddr[3], + dst_haddr[4], dst_haddr[5]); +#endif + + ethh = (struct ethhdr *)buf; + for (i = 0; i < ETH_ALEN; i++) { + ethh->h_source[i] = CONFIG.eths[eidx].haddr[i]; + ethh->h_dest[i] = dst_haddr[i]; + } + ethh->h_proto = htons(h_proto); + + return (uint8_t *)(ethh + 1); +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/eventpoll.c b/lib/flash/mtcp/eventpoll.c new file mode 100644 index 0000000..1dfd072 --- /dev/null +++ b/lib/flash/mtcp/eventpoll.c @@ -0,0 +1,633 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "mtcp.h" +#include "tcp_stream.h" +#include "eventpoll.h" +#include "tcp_in.h" +#include "pipe.h" +#include "debug.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +#define SPIN_BEFORE_SLEEP FALSE +#define SPIN_THRESH 10000000 + +/*----------------------------------------------------------------------------*/ +const char *event_str[] = { "NONE", "IN", "PRI", "OUT", "ERR", "HUP", "RDHUP" }; +/*----------------------------------------------------------------------------*/ +const char *EventToString(uint32_t event) +{ + switch (event) { + case MTCP_EPOLLNONE: + return event_str[0]; + break; + case MTCP_EPOLLIN: + return event_str[1]; + break; + case MTCP_EPOLLPRI: + return event_str[2]; + break; + case MTCP_EPOLLOUT: + return event_str[3]; + break; + case MTCP_EPOLLERR: + return event_str[4]; + break; + case MTCP_EPOLLHUP: + return event_str[5]; + break; + case MTCP_EPOLLRDHUP: + return event_str[6]; + break; + default: + assert(0); + } + + assert(0); + return NULL; +} +/*----------------------------------------------------------------------------*/ +static struct event_queue *CreateEventQueue(int size) +{ + struct event_queue *eq; + + eq = (struct event_queue *)calloc(1, sizeof(struct event_queue)); + if (!eq) + return NULL; + + eq->start = 0; + eq->end = 0; + eq->size = size; + eq->events = (struct mtcp_epoll_event_int *)calloc(size, sizeof(struct mtcp_epoll_event_int)); + if (!eq->events) { + free(eq); + return NULL; + } + eq->num_events = 0; + + return eq; +} +/*----------------------------------------------------------------------------*/ +static void DestroyEventQueue(struct event_queue *eq) +{ + if (eq->events) + free(eq->events); + + free(eq); +} +/*----------------------------------------------------------------------------*/ +int mtcp_epoll_create1(mctx_t mctx, int flags) +{ + int rc; + struct mtcp_conf mcfg; + + rc = 0; + mtcp_getconf(&mcfg); + + switch (flags) { + case 0: + /* do nothing */ + case O_CLOEXEC: + /* + * this won't work since mTCP apps + * assume that user does not fork/exec + */ + rc = mtcp_epoll_create(mctx, mcfg.max_concurrency * 3); + break; + default: + TRACE_ERROR("[CPU %d] Invalid 
flags for %s set!\n", mctx->cpu, __FUNCTION__); + errno = EINVAL; + rc = -1; + break; + } + + return rc; +} +/*----------------------------------------------------------------------------*/ +int mtcp_epoll_create(mctx_t mctx, int size) +{ + mtcp_manager_t mtcp = g_mtcp[mctx->cpu]; + struct mtcp_epoll *ep; + socket_map_t epsocket; + + if (size <= 0) { + errno = EINVAL; + return -1; + } + + epsocket = AllocateSocket(mctx, MTCP_SOCK_EPOLL, FALSE); + if (!epsocket) { + errno = ENFILE; + return -1; + } + + ep = (struct mtcp_epoll *)calloc(1, sizeof(struct mtcp_epoll)); + if (!ep) { + FreeSocket(mctx, epsocket->id, FALSE); + return -1; + } + + /* create event queues */ + ep->usr_queue = CreateEventQueue(size); + if (!ep->usr_queue) { + FreeSocket(mctx, epsocket->id, FALSE); + free(ep); + return -1; + } + + ep->usr_shadow_queue = CreateEventQueue(size); + if (!ep->usr_shadow_queue) { + DestroyEventQueue(ep->usr_queue); + FreeSocket(mctx, epsocket->id, FALSE); + free(ep); + return -1; + } + + ep->mtcp_queue = CreateEventQueue(size); + if (!ep->mtcp_queue) { + DestroyEventQueue(ep->usr_shadow_queue); + DestroyEventQueue(ep->usr_queue); + FreeSocket(mctx, epsocket->id, FALSE); + free(ep); + return -1; + } + + TRACE_EPOLL("epoll structure of size %d created.\n", size); + + mtcp->ep = ep; + epsocket->ep = ep; + + if (pthread_mutex_init(&ep->epoll_lock, NULL)) { + DestroyEventQueue(ep->mtcp_queue); + DestroyEventQueue(ep->usr_shadow_queue); + DestroyEventQueue(ep->usr_queue); + FreeSocket(mctx, epsocket->id, FALSE); + free(ep); + return -1; + } + if (pthread_cond_init(&ep->epoll_cond, NULL)) { + DestroyEventQueue(ep->mtcp_queue); + DestroyEventQueue(ep->usr_shadow_queue); + DestroyEventQueue(ep->usr_queue); + FreeSocket(mctx, epsocket->id, FALSE); + free(ep); + return -1; + } + + return epsocket->id; +} +/*----------------------------------------------------------------------------*/ +int CloseEpollSocket(mctx_t mctx, int epid) +{ + mtcp_manager_t mtcp; + struct mtcp_epoll *ep; 
+ + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + ep = mtcp->smap[epid].ep; + if (!ep) { + errno = EINVAL; + return -1; + } + + DestroyEventQueue(ep->usr_queue); + DestroyEventQueue(ep->usr_shadow_queue); + DestroyEventQueue(ep->mtcp_queue); + + pthread_mutex_lock(&ep->epoll_lock); + mtcp->ep = NULL; + mtcp->smap[epid].ep = NULL; + pthread_cond_signal(&ep->epoll_cond); + pthread_mutex_unlock(&ep->epoll_lock); + + pthread_cond_destroy(&ep->epoll_cond); + pthread_mutex_destroy(&ep->epoll_lock); + free(ep); + + return 0; +} +/*----------------------------------------------------------------------------*/ +static int RaisePendingStreamEvents(mtcp_manager_t mtcp, struct mtcp_epoll *ep, socket_map_t socket) +{ + (void)mtcp; + tcp_stream *stream = socket->stream; + + if (!stream) + return -1; + if (stream->state < TCP_ST_ESTABLISHED) + return -1; + + TRACE_EPOLL("Stream %d at state %s\n", stream->id, TCPStateToString(stream)); + /* if there are payloads already read before epoll registration */ + /* generate read event */ + if (socket->epoll & MTCP_EPOLLIN) { + struct tcp_recv_vars *rcvvar = stream->rcvvar; + if (rcvvar->rcvbuf && rcvvar->rcvbuf->merged_len > 0) { + TRACE_EPOLL("Socket %d: Has existing payloads\n", socket->id); + AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN); + } else if (stream->state == TCP_ST_CLOSE_WAIT) { + TRACE_EPOLL("Socket %d: Waiting for close\n", socket->id); + AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN); + } + } + + /* same thing to the write event */ + if (socket->epoll & MTCP_EPOLLOUT) { + struct tcp_send_vars *sndvar = stream->sndvar; + if (!sndvar->sndbuf || (sndvar->sndbuf && sndvar->snd_wnd > 0)) { + if (!(socket->events & MTCP_EPOLLOUT)) { + TRACE_EPOLL("Socket %d: Adding write event\n", socket->id); + AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLOUT); + } + } + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +int 
mtcp_epoll_ctl(mctx_t mctx, int epid, int op, int sockid, struct mtcp_epoll_event *event) +{ + mtcp_manager_t mtcp; + struct mtcp_epoll *ep; + socket_map_t socket; + uint32_t events; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (epid < 0 || epid >= CONFIG.max_concurrency) { + TRACE_API("Epoll id %d out of range.\n", epid); + errno = EBADF; + return -1; + } + + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + TRACE_API("Socket id %d out of range.\n", sockid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[epid].socktype == MTCP_SOCK_UNUSED) { + errno = EBADF; + return -1; + } + + if (mtcp->smap[epid].socktype != MTCP_SOCK_EPOLL) { + errno = EINVAL; + return -1; + } + + ep = mtcp->smap[epid].ep; + if (!ep || (!event && op != MTCP_EPOLL_CTL_DEL)) { + errno = EINVAL; + return -1; + } + socket = &mtcp->smap[sockid]; + + if (op == MTCP_EPOLL_CTL_ADD) { + if (socket->epoll) { + errno = EEXIST; + return -1; + } + + /* EPOLLERR and EPOLLHUP are registered as default */ + events = event->events; + events |= (MTCP_EPOLLERR | MTCP_EPOLLHUP); + socket->ep_data = event->data; + socket->epoll = events; + + TRACE_EPOLL("Adding epoll socket %d(type %d) ET: %u, IN: %u, OUT: %u\n", socket->id, socket->socktype, + socket->epoll & MTCP_EPOLLET, socket->epoll & MTCP_EPOLLIN, socket->epoll & MTCP_EPOLLOUT); + + if (socket->socktype == MTCP_SOCK_STREAM) { + RaisePendingStreamEvents(mtcp, ep, socket); + } else if (socket->socktype == MTCP_SOCK_PIPE) { + RaisePendingPipeEvents(mctx, epid, sockid); + } + + } else if (op == MTCP_EPOLL_CTL_MOD) { + if (!socket->epoll) { + pthread_mutex_unlock(&ep->epoll_lock); + errno = ENOENT; + return -1; + } + + events = event->events; + events |= (MTCP_EPOLLERR | MTCP_EPOLLHUP); + socket->ep_data = event->data; + socket->epoll = events; + + if (socket->socktype == MTCP_SOCK_STREAM) { + RaisePendingStreamEvents(mtcp, ep, socket); + } else if (socket->socktype == MTCP_SOCK_PIPE) { + RaisePendingPipeEvents(mctx, epid, 
sockid); + } + + } else if (op == MTCP_EPOLL_CTL_DEL) { + if (!socket->epoll) { + errno = ENOENT; + return -1; + } + + socket->epoll = MTCP_EPOLLNONE; + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +int mtcp_epoll_wait(mctx_t mctx, int epid, struct mtcp_epoll_event *events, int maxevents, int timeout) +{ + mtcp_manager_t mtcp; + struct mtcp_epoll *ep; + struct event_queue *eq; + struct event_queue *eq_shadow; + socket_map_t event_socket; + int validity; + int i, cnt, ret; + int num_events; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + + if (epid < 0 || epid >= CONFIG.max_concurrency) { + TRACE_API("Epoll id %d out of range.\n", epid); + errno = EBADF; + return -1; + } + + if (mtcp->smap[epid].socktype == MTCP_SOCK_UNUSED) { + errno = EBADF; + return -1; + } + + if (mtcp->smap[epid].socktype != MTCP_SOCK_EPOLL) { + errno = EINVAL; + return -1; + } + + ep = mtcp->smap[epid].ep; + if (!ep || !events || maxevents <= 0) { + errno = EINVAL; + return -1; + } + + ep->stat.calls++; + +#if SPIN_BEFORE_SLEEP + int spin = 0; + while (ep->num_events == 0 && spin < SPIN_THRESH) { + spin++; + } +#endif /* SPIN_BEFORE_SLEEP */ + + if (pthread_mutex_lock(&ep->epoll_lock)) { + if (errno == EDEADLK) + perror("mtcp_epoll_wait: epoll_lock blocked\n"); + assert(0); + } + +wait: + eq = ep->usr_queue; + eq_shadow = ep->usr_shadow_queue; + + /* wait until event occurs */ + while (eq->num_events == 0 && eq_shadow->num_events == 0 && timeout != 0) { +#if INTR_SLEEPING_MTCP + /* signal to mtcp thread if it is sleeping */ + if (mtcp->wakeup_flag && mtcp->is_sleeping) { + pthread_kill(mtcp->ctx->thread, SIGUSR1); + } +#endif + ep->stat.waits++; + ep->waiting = TRUE; + if (timeout > 0) { + struct timespec deadline; + + clock_gettime(CLOCK_REALTIME, &deadline); + if (timeout >= 1000) { + int sec; + sec = timeout / 1000; + deadline.tv_sec += sec; + timeout -= sec * 1000; + } + + deadline.tv_nsec += timeout * 1000000; + + 
if (deadline.tv_nsec >= 1000000000) { + deadline.tv_sec++; + deadline.tv_nsec -= 1000000000; + } + + //deadline.tv_sec = mtcp->cur_tv.tv_sec; + //deadline.tv_nsec = (mtcp->cur_tv.tv_usec + timeout * 1000) * 1000; + ret = pthread_cond_timedwait(&ep->epoll_cond, &ep->epoll_lock, &deadline); + if (ret && ret != ETIMEDOUT) { + /* errno set by pthread_cond_timedwait() */ + pthread_mutex_unlock(&ep->epoll_lock); + TRACE_ERROR("pthread_cond_timedwait failed. ret: %d, error: %s\n", ret, strerror(errno)); + return -1; + } + timeout = 0; + } else if (timeout < 0) { + ret = pthread_cond_wait(&ep->epoll_cond, &ep->epoll_lock); + if (ret) { + /* errno set by pthread_cond_wait() */ + pthread_mutex_unlock(&ep->epoll_lock); + TRACE_ERROR("pthread_cond_wait failed. ret: %d, error: %s\n", ret, strerror(errno)); + return -1; + } + } + ep->waiting = FALSE; + + if (mtcp->ctx->done || mtcp->ctx->exit || mtcp->ctx->interrupt) { + mtcp->ctx->interrupt = FALSE; + //ret = pthread_cond_signal(&ep->epoll_cond); + pthread_mutex_unlock(&ep->epoll_lock); + errno = EINTR; + return -1; + } + } + + /* fetch events from the user event queue */ + cnt = 0; + num_events = eq->num_events; + for (i = 0; i < num_events && cnt < maxevents; i++) { + event_socket = &mtcp->smap[eq->events[eq->start].sockid]; + validity = TRUE; + if (event_socket->socktype == MTCP_SOCK_UNUSED) + validity = FALSE; + if (!(event_socket->epoll & eq->events[eq->start].ev.events)) + validity = FALSE; + if (!(event_socket->events & eq->events[eq->start].ev.events)) + validity = FALSE; + + if (validity) { + events[cnt++] = eq->events[eq->start].ev; + assert(eq->events[eq->start].sockid >= 0); + + TRACE_EPOLL("Socket %d: Handled event. 
event: %s, " + "start: %u, end: %u, num: %u\n", + event_socket->id, EventToString(eq->events[eq->start].ev.events), eq->start, eq->end, + eq->num_events); + ep->stat.handled++; + } else { + TRACE_EPOLL("Socket %d: event %s invalidated.\n", eq->events[eq->start].sockid, + EventToString(eq->events[eq->start].ev.events)); + ep->stat.invalidated++; + } + event_socket->events &= (~eq->events[eq->start].ev.events); + + eq->start++; + eq->num_events--; + if (eq->start >= eq->size) { + eq->start = 0; + } + } + + /* fetch eventes from user shadow event queue */ + eq = ep->usr_shadow_queue; + num_events = eq->num_events; + for (i = 0; i < num_events && cnt < maxevents; i++) { + event_socket = &mtcp->smap[eq->events[eq->start].sockid]; + validity = TRUE; + if (event_socket->socktype == MTCP_SOCK_UNUSED) + validity = FALSE; + if (!(event_socket->epoll & eq->events[eq->start].ev.events)) + validity = FALSE; + if (!(event_socket->events & eq->events[eq->start].ev.events)) + validity = FALSE; + + if (validity) { + events[cnt++] = eq->events[eq->start].ev; + assert(eq->events[eq->start].sockid >= 0); + + TRACE_EPOLL("Socket %d: Handled event. 
event: %s, " + "start: %u, end: %u, num: %u\n", + event_socket->id, EventToString(eq->events[eq->start].ev.events), eq->start, eq->end, + eq->num_events); + ep->stat.handled++; + } else { + TRACE_EPOLL("Socket %d: event %s invalidated.\n", eq->events[eq->start].sockid, + EventToString(eq->events[eq->start].ev.events)); + ep->stat.invalidated++; + } + event_socket->events &= (~eq->events[eq->start].ev.events); + + eq->start++; + eq->num_events--; + if (eq->start >= eq->size) { + eq->start = 0; + } + } + + if (cnt == 0 && timeout != 0) + goto wait; + + pthread_mutex_unlock(&ep->epoll_lock); + + return cnt; +} +/*----------------------------------------------------------------------------*/ +inline int AddEpollEvent(struct mtcp_epoll *ep, int queue_type, socket_map_t socket, uint32_t event) +{ + struct event_queue *eq; + int index; + + if (!ep || !socket || !event) + return -1; + + ep->stat.issued++; + + if (socket->events & event) { + return 0; + } + + if (queue_type == MTCP_EVENT_QUEUE) { + eq = ep->mtcp_queue; + } else if (queue_type == USR_EVENT_QUEUE) { + eq = ep->usr_queue; + pthread_mutex_lock(&ep->epoll_lock); + } else if (queue_type == USR_SHADOW_EVENT_QUEUE) { + eq = ep->usr_shadow_queue; + } else { + TRACE_ERROR("Non-existing event queue type!\n"); + return -1; + } + + if (eq->num_events >= eq->size) { + TRACE_ERROR("Exceeded epoll event queue! 
num_events: %d, size: %d\n", eq->num_events, eq->size); + if (queue_type == USR_EVENT_QUEUE) + pthread_mutex_unlock(&ep->epoll_lock); + return -1; + } + + index = eq->end++; + + socket->events |= event; + eq->events[index].sockid = socket->id; + eq->events[index].ev.events = event; + eq->events[index].ev.data = socket->ep_data; + + if (eq->end >= eq->size) { + eq->end = 0; + } + eq->num_events++; + +#if 0 + TRACE_EPOLL("Socket %d New event: %s, start: %u, end: %u, num: %u\n", + ep->events[index].sockid, + EventToString(ep->events[index].ev.events), + ep->start, ep->end, ep->num_events); +#endif + + if (queue_type == USR_EVENT_QUEUE) + pthread_mutex_unlock(&ep->epoll_lock); + + ep->stat.registered++; + + return 0; +} diff --git a/lib/flash/mtcp/fhash.c b/lib/flash/mtcp/fhash.c new file mode 100644 index 0000000..79337d1 --- /dev/null +++ b/lib/flash/mtcp/fhash.c @@ -0,0 +1,221 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "debug.h" +#include "fhash.h" + +#define IS_FLOW_TABLE(x) (x == HashFlow) +#define IS_LISTEN_TABLE(x) (x == HashListener) +#if USE_CCP +#define IS_SID_TABLE(x) (x == HashSID) +#endif + +/*----------------------------------------------------------------------------*/ +struct hashtable *CreateHashtable(unsigned int (*hashfn)(const void *), // key function + int (*eqfn)(const void *, const void *), // equality + int bins) // no of bins +{ + int i; + struct hashtable *ht = calloc(1, sizeof(struct hashtable)); + if (!ht) { + TRACE_ERROR("calloc: CreateHashtable"); + return 0; + } + + ht->hashfn = hashfn; + ht->eqfn = eqfn; + ht->bins = bins; + + /* creating bins */ +#if USE_CCP + if (IS_FLOW_TABLE(hashfn) || IS_SID_TABLE(hashfn)) { +#else + if (IS_FLOW_TABLE(hashfn)) { +#endif + ht->ht_table = calloc(bins, sizeof(hash_bucket_head)); + if (!ht->ht_table) { + TRACE_ERROR("calloc: CreateHashtable bins!\n"); + free(ht); + return 0; + } + /* init the tables */ + for (i = 0; i < bins; i++) + TAILQ_INIT(&ht->ht_table[i]); + } else if (IS_LISTEN_TABLE(hashfn)) { + ht->lt_table = calloc(bins, sizeof(list_bucket_head)); + if 
(!ht->lt_table) { + TRACE_ERROR("calloc: CreateHashtable bins!\n"); + free(ht); + return 0; + } + /* init the tables */ + for (i = 0; i < bins; i++) + TAILQ_INIT(&ht->lt_table[i]); + } + + return ht; +} +/*----------------------------------------------------------------------------*/ +void DestroyHashtable(struct hashtable *ht) +{ + if (IS_FLOW_TABLE(ht->hashfn)) + free(ht->ht_table); + else /* IS_LISTEN_TABLE(ht->hashfn) */ + free(ht->lt_table); + free(ht); +} +/*----------------------------------------------------------------------------*/ +int StreamHTInsert(struct hashtable *ht, void *it) +{ + /* create an entry*/ + int idx; + tcp_stream *item = (tcp_stream *)it; + + assert(ht); + + idx = ht->hashfn(item); + assert(idx >= 0 && idx < NUM_BINS_FLOWS); + + TAILQ_INSERT_TAIL(&ht->ht_table[idx], item, rcvvar->he_link); + + item->ht_idx = TCP_AR_CNT; + + return 0; +} +/*----------------------------------------------------------------------------*/ +void *StreamHTRemove(struct hashtable *ht, void *it) +{ + hash_bucket_head *head; + tcp_stream *item = (tcp_stream *)it; + int idx = ht->hashfn(item); + + head = &ht->ht_table[idx]; + TAILQ_REMOVE(head, item, rcvvar->he_link); + + return (item); +} +/*----------------------------------------------------------------------------*/ +void *StreamHTSearch(struct hashtable *ht, const void *it) +{ + int idx; + const tcp_stream *item = (const tcp_stream *)it; + tcp_stream *walk; + hash_bucket_head *head; + + idx = ht->hashfn(item); + + head = &ht->ht_table[ht->hashfn(item)]; + TAILQ_FOREACH(walk, head, rcvvar->he_link) + { + if (ht->eqfn(walk, item)) + return walk; + } + + UNUSED(idx); + return NULL; +} +/*----------------------------------------------------------------------------*/ +unsigned int HashListener(const void *l) +{ + const struct tcp_listener *listener = (const struct tcp_listener *)l; + + return listener->socket->saddr.sin_port & (NUM_BINS_LISTENERS - 1); +} 
+/*----------------------------------------------------------------------------*/ +int EqualListener(const void *l1, const void *l2) +{ + const struct tcp_listener *listener1 = (const struct tcp_listener *)l1; + const struct tcp_listener *listener2 = (const struct tcp_listener *)l2; + + return (listener1->socket->saddr.sin_port == listener2->socket->saddr.sin_port); +} +/*----------------------------------------------------------------------------*/ +int ListenerHTInsert(struct hashtable *ht, void *it) +{ + /* create an entry*/ + int idx; + struct tcp_listener *item = (struct tcp_listener *)it; + + assert(ht); + + idx = ht->hashfn(item); + assert(idx >= 0 && idx < NUM_BINS_LISTENERS); + + TAILQ_INSERT_TAIL(&ht->lt_table[idx], item, he_link); + + return 0; +} +/*----------------------------------------------------------------------------*/ +void *ListenerHTRemove(struct hashtable *ht, void *it) +{ + list_bucket_head *head; + struct tcp_listener *item = (struct tcp_listener *)it; + int idx = ht->hashfn(item); + + head = &ht->lt_table[idx]; + TAILQ_REMOVE(head, item, he_link); + + return (item); +} +/*----------------------------------------------------------------------------*/ +void *ListenerHTSearch(struct hashtable *ht, const void *it) +{ + int idx; + struct tcp_listener item; + uint16_t port = *((const uint16_t *)it); + struct tcp_listener *walk; + list_bucket_head *head; + struct socket_map s; + + s.saddr.sin_port = port; + item.socket = &s; + + idx = ht->hashfn(&item); + + head = &ht->lt_table[idx]; + TAILQ_FOREACH(walk, head, he_link) + { + if (ht->eqfn(walk, &item)) + return walk; + } + + return NULL; +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/flash_module.c b/lib/flash/mtcp/flash_module.c new file mode 100644 index 0000000..295d027 --- /dev/null +++ b/lib/flash/mtcp/flash_module.c @@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Code taken from 
https://github.com/mcabranches/mtcp/tree/af_xdp_support and extended by + * Debojeet Das for Flash NF support using Flash API. +*/ + +#include "io_module.h" +#ifndef DISABLE_AFXDP + +#include +#include +#include + +/* for mtcp related def'ns */ +#include "mtcp.h" +/* for errno */ +#include +/* for logging */ +#include "debug.h" +/* for num_devices_* */ +#include "config.h" +/* for ETHER_CRC_LEN */ +#include + +/*----------------------------------------------------------------------------*/ +#define MAX_IFNAMELEN (IF_NAMESIZE + 10) + +/* + * Ethernet frame overhead + */ +#define ETHER_IFG 12 +#define ETHER_PREAMBLE 8 +#define ETHER_OVR (ETHER_CRC_LEN + ETHER_PREAMBLE + ETHER_IFG) + +/*----------------------------------------------------------------------------*/ + +struct afxdp_private_context { // private context on mTCP + struct config cfg; + struct nf *nf; + struct xskvec *recvvecs; + struct xskvec *sendvecs; + struct xskvec *dropvecs; + uint32_t recv_index; + uint32_t send_index; + struct pollfd fds[1]; +} __attribute__((aligned(__WORDSIZE))); + +/*----------------------------------------------------------------------------*/ +void afxdp_load_module(void); +void afxdp_init_handle(struct mtcp_thread_context *ctxt); +int afxdp_link_devices(struct mtcp_thread_context *ctxt); +int afxdp_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx); +int afxdp_send_pkts(struct mtcp_thread_context *ctxt, int ifidx); +void afxdp_release_pkt(struct mtcp_thread_context *ctxt, int ifidx, unsigned char *pkt_data, int len); +void afxdp_drop_pkts(struct mtcp_thread_context *ctxt); +uint8_t *afxdp_get_wptr(struct mtcp_thread_context *ctxt, int ifidx, uint16_t len); +uint8_t *afxdp_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len); +int afxdp_select(struct mtcp_thread_context *ctxt); +void afxdp_destroy_handle(struct mtcp_thread_context *ctxt); +int afxdp_dev_ioctl(struct mtcp_thread_context *ctxt, int nif, int cmd, void *argp); + 
+/*----------------------------------------------------------------------------*/ +void afxdp_load_module(void) +{ + /* not needed - all initializations done in afxdp_init_handle() */ +} + +/*----------------------------------------------------------------------------*/ +void afxdp_init_handle(struct mtcp_thread_context *ctxt) +{ + struct afxdp_private_context *axpc; + int j; + + /* create and initialize private I/O module context */ + ctxt->io_private_context = calloc(1, sizeof(struct afxdp_private_context)); + if (ctxt->io_private_context == NULL) { + TRACE_ERROR("Failed to initialize ctxt->io_private_context: " + "Can't allocate memory\n"); + exit(EXIT_FAILURE); + } + + axpc = (struct afxdp_private_context *)ctxt->io_private_context; + + // m-> come back here to evaluate the multiple interface scenario + // d-> I am also assuming that there is only one interface + for (j = 0; j < num_devices_attached; j++) { + axpc->cfg.app_name = "MTCP"; + axpc->cfg.app_options = NULL; + + // custom argv for the afxdp module + char *argv[6]; + argv[0] = strdup("mtcp"); + argv[1] = strdup("-u"); + argv[2] = strdup("0"); + argv[3] = strdup("-f"); + argv[4] = strdup("0"); + argv[5] = strdup("-t"); + + if (flash__parse_cmdline_args(6, argv, &axpc->cfg) < 0) + goto out_cfg; + + if (flash__configure_nf(&axpc->nf, &axpc->cfg) < 0) + goto out_cfg; + + log_info("Control Plane setup done..."); + + // m-> set the receiving queue to the processing core number + // d-> i am consdering this info is setup from the config file + // axpc->cfg.ifname = ifname; + // axpc->cfg.xsk_if_queue = ctxt->cpu; + + // axpc->packet_buffer_size = NUM_FRAMES * FRAME_SIZE; + // if (posix_memalign(&axpc->packet_buffer, getpagesize(), /* PAGE_SIZE aligned */ + // axpc->packet_buffer_size)) { + // fprintf(stderr, "ERROR: Can't allocate buffer memory \"%s\"\n", strerror(errno)); + // exit(EXIT_FAILURE); + // } + + // /* Initialize shared packet_buffer for umem usage */ + // axpc->umem = 
configure_xsk_umem(axpc->packet_buffer, axpc->packet_buffer_size); + // if (axpc->umem == NULL) { + // fprintf(stderr, "ERROR: Can't create umem \"%s\"\n", strerror(errno)); + // exit(EXIT_FAILURE); + // } + + /* Open and configure the AF_XDP (xsk) socket */ + // axpc->xsk_socket = xsk_configure_socket(&axpc->cfg, axpc->umem); + // if (axpc->xsk_socket == NULL) { + // fprintf(stderr, "ERROR: Can't setup AF_XDP socket \"%s\"\n", strerror(errno)); + // exit(EXIT_FAILURE); + // } + } + + memset(axpc->fds, 0, sizeof(axpc->fds)); + axpc->fds[0].fd = axpc->nf->thread[0]->socket->fd; // d-> Assuming single thread for now + axpc->fds[0].events = POLLIN; + + // d-> initialize send vectors + axpc->sendvecs = calloc(axpc->cfg.xsk->batch_size, sizeof(struct xskvec)); + if (!axpc->sendvecs) { + log_error("Failed to allocate xskvecs array"); + goto out_cfg_close; + } + axpc->send_index = 0; + + axpc->recvvecs = calloc(axpc->cfg.xsk->batch_size, sizeof(struct xskvec)); + if (!axpc->recvvecs) { + log_error("Failed to allocate recv xskvecs array"); + free(axpc->sendvecs); + goto out_cfg_close; + } + axpc->recv_index = 0; + + axpc->dropvecs = calloc(axpc->cfg.xsk->batch_size, sizeof(struct xskvec)); + if (!axpc->dropvecs) { + log_error("Failed to allocate drop xskvecs array"); + free(axpc->sendvecs); + free(axpc->recvvecs); + goto out_cfg_close; + } + + return; + +out_cfg_close: + flash__xsk_close(&axpc->cfg, axpc->nf); +out_cfg: + free(&axpc->cfg); + exit(EXIT_FAILURE); +} + +/*----------------------------------------------------------------------------*/ +int afxdp_link_devices(struct mtcp_thread_context *ctxt) +{ + (void)ctxt; // d-> unused parameter + /* linking takes place during mtcp_init() */ + return 0; +} + +/*----------------------------------------------------------------------------*/ +int32_t afxdp_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx) +{ + (void)ifidx; // d-> unused parameter + int ret, nfds = 1; + uint32_t nrecv = 0; + struct afxdp_private_context 
*axpc; + axpc = (struct afxdp_private_context *)ctxt->io_private_context; + struct socket *xsk = axpc->nf->thread[0]->socket; // d-> Assuming single thread for now + + ret = flash__poll(&axpc->cfg, xsk, axpc->fds, nfds); + if (!(ret == 1 || ret == -2)) + return 0; + + nrecv = flash__recvmsg(&axpc->cfg, xsk, axpc->recvvecs, axpc->cfg.xsk->batch_size); + return nrecv; +} + +/*----------------------------------------------------------------------------*/ +// m-> function to return the pointers to mTCP (This should iterate through to the number of +// recv pkts and return pointers to pkts to be processed by mTCP) +uint8_t *afxdp_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len) +{ + (void)ifidx; // d-> unused parameter + (void)index; // d-> unused parameter + struct afxdp_private_context *axpc; + axpc = (struct afxdp_private_context *)ctxt->io_private_context; + + printf("get_rptr: recv_index=%u\n", axpc->recv_index); + + uint8_t *pktbuf = axpc->recvvecs[axpc->recv_index].data; + *len = axpc->recvvecs[axpc->recv_index].len; + + axpc->dropvecs[axpc->recv_index] = axpc->recvvecs[axpc->recv_index]; + axpc->recv_index++; + + return pktbuf; +} + +/*----------------------------------------------------------------------------*/ +void afxdp_drop_pkts(struct mtcp_thread_context *ctxt) +{ + struct afxdp_private_context *axpc; + axpc = (struct afxdp_private_context *)ctxt->io_private_context; + + if (flash__dropmsg(&axpc->cfg, axpc->nf->thread[0]->socket, axpc->dropvecs, axpc->recv_index) != axpc->recv_index) { + log_error("Failed to drop messages"); + axpc->recv_index = 0; + return; + } + axpc->recv_index = 0; +} + +/*----------------------------------------------------------------------------*/ +void afxdp_release_pkt(struct mtcp_thread_context *ctxt, int ifidx, unsigned char *pkt_data, int len) +{ + (void)ctxt; // d-> unused parameter + (void)ifidx; // d-> unused parameter + (void)pkt_data; // d-> unused parameter + (void)len; // d-> unused 
parameter + /* not needed - drop packets is handled seperately */ +} + +/*----------------------------------------------------------------------------*/ +//m-> for more details of what needs to be done see my notebook (mtcp pktio), pages 44 and 54 +//also see the get_wptr dpdk function to see more details +uint8_t *afxdp_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t pktsize) +{ + (void)nif; // d-> unused parameter + struct afxdp_private_context *axpc; + axpc = (struct afxdp_private_context *)ctxt->io_private_context; + + printf("get_wptr: send_index=%u\n", axpc->send_index); + + struct xskvec tmpvec; + flash__allocmsg(&axpc->cfg, axpc->nf->thread[0]->socket, &tmpvec, 1); + + uint8_t *pktbuf = tmpvec.data; + + axpc->sendvecs[axpc->send_index].data = pktbuf; + axpc->sendvecs[axpc->send_index].len = pktsize; + axpc->sendvecs[axpc->send_index].addr = tmpvec.addr; + axpc->sendvecs[axpc->send_index++].options = 0; + + return pktbuf; +} + +/*----------------------------------------------------------------------------*/ +int afxdp_send_pkts(struct mtcp_thread_context *ctxt, int nif) +{ + (void)nif; // d-> unused parameter + struct afxdp_private_context *axpc; + axpc = (struct afxdp_private_context *)ctxt->io_private_context; + + flash__sendmsg(&axpc->cfg, axpc->nf->thread[0]->socket, axpc->sendvecs, axpc->send_index); + axpc->send_index = 0; + + return 1; +} + +/*----------------------------------------------------------------------------*/ +int32_t afxdp_select(struct mtcp_thread_context *ctxt) +{ + (void)ctxt; // d-> unused parameter + // m-> implement + // d-> implement + return 0; // d-> return 0 for now, as select is not implemented +} + +/*----------------------------------------------------------------------------*/ +void afxdp_destroy_handle(struct mtcp_thread_context *ctxt) +{ + struct afxdp_private_context *axpc; + axpc = (struct afxdp_private_context *)ctxt->io_private_context; + + free(axpc->recvvecs); + free(axpc->sendvecs); + 
free(axpc->dropvecs); + flash__xsk_close(&axpc->cfg, axpc->nf); + free(&axpc->cfg); + free(axpc); +} + +/*----------------------------------------------------------------------------*/ +io_module_func afxdp_module_func = { .load_module = afxdp_load_module, + .init_handle = afxdp_init_handle, + .link_devices = afxdp_link_devices, + .recv_pkts = afxdp_recv_pkts, + .get_rptr = afxdp_get_rptr, + .drop_pkts = afxdp_drop_pkts, + .release_pkt = afxdp_release_pkt, + .get_wptr = afxdp_get_wptr, + .send_pkts = afxdp_send_pkts, + .select = afxdp_select, + .destroy_handle = afxdp_destroy_handle, + .dev_ioctl = NULL }; +/*----------------------------------------------------------------------------*/ +#endif /* !DISABLE_AFXDP */ diff --git a/lib/flash/mtcp/icmp.c b/lib/flash/mtcp/icmp.c new file mode 100644 index 0000000..e412805 --- /dev/null +++ b/lib/flash/mtcp/icmp.c @@ -0,0 +1,178 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include "mtcp.h" +#include "icmp.h" +#include "eth_out.h" +#include "ip_in.h" +#include "ip_out.h" +#include "debug.h" +#include "arp.h" + +#define IP_NEXT_PTR(iph) ((uint8_t *)iph + (iph->ihl << 2)) +/*----------------------------------------------------------------------------*/ +void DumpICMPPacket(mtcp_manager_t mtcp, struct icmphdr *icmph, uint32_t saddr, uint32_t daddr); +/*----------------------------------------------------------------------------*/ +static uint16_t ICMPChecksum(uint16_t *icmph, int len) +{ + assert(len >= 0); + + uint16_t ret = 0; + uint32_t sum = 0; + uint16_t odd_byte; + + while (len > 1) { + sum += *icmph++; + len -= 2; + } + + if (len == 1) { + *(uint8_t *)(&odd_byte) = *(uint8_t *)icmph; + sum += odd_byte; + } + + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + ret = ~sum; + + return ret; +} +/*----------------------------------------------------------------------------*/ +static int ICMPOutput(struct mtcp_manager *mtcp, uint32_t saddr, uint32_t daddr, uint8_t icmp_type, uint8_t icmp_code, + uint16_t icmp_id, uint16_t icmp_seq, uint8_t *icmpd, uint16_t len) +{ + struct icmphdr *icmph; + + icmph = (struct icmphdr 
*)IPOutputStandalone(mtcp, IPPROTO_ICMP, 0, saddr, daddr, sizeof(struct icmphdr) + len); + if (!icmph) + return -1; + + /* Fill in the icmp header */ + icmph->icmp_type = icmp_type; + icmph->icmp_code = icmp_code; + icmph->icmp_checksum = 0; + ICMP_ECHO_SET_ID(icmph, htons(icmp_id)); + ICMP_ECHO_SET_SEQ(icmph, htons(icmp_seq)); + + /* Fill in the icmp data */ + if (len > 0) + memcpy((void *)(icmph + 1), icmpd, len); + + /* Calculate ICMP Checksum with header and data */ + icmph->icmp_checksum = ICMPChecksum((uint16_t *)icmph, sizeof(struct icmphdr) + len); + +#if defined(DBGMSG) + DumpICMPPacket(mtcp, icmph, saddr, daddr); +#endif + return 0; +} +/*----------------------------------------------------------------------------*/ +void RequestICMP(mtcp_manager_t mtcp, uint32_t saddr, uint32_t daddr, uint16_t icmp_id, uint16_t icmp_sequence, uint8_t *icmpd, + uint16_t len) +{ + /* send icmp request with given parameters */ + ICMPOutput(mtcp, saddr, daddr, ICMP_ECHO, 0, ntohs(icmp_id), ntohs(icmp_sequence), icmpd, len); +} +/*----------------------------------------------------------------------------*/ +static int ProcessICMPECHORequest(mtcp_manager_t mtcp, struct iphdr *iph, int len) +{ + int ret = 0; + struct icmphdr *icmph = (struct icmphdr *)IP_NEXT_PTR(iph); + /* Check correctness of ICMP checksum and send ICMP echo reply */ + if (ICMPChecksum((uint16_t *)icmph, len - (iph->ihl << 2))) + ret = ERROR; + else + ICMPOutput(mtcp, iph->daddr, iph->saddr, ICMP_ECHOREPLY, 0, ntohs(ICMP_ECHO_GET_ID(icmph)), + ntohs(ICMP_ECHO_GET_SEQ(icmph)), (uint8_t *)(icmph + 1), + (uint16_t)(len - (iph->ihl << 2) - sizeof(struct icmphdr))); + + return ret; +} +/*----------------------------------------------------------------------------*/ +int ProcessICMPPacket(mtcp_manager_t mtcp, struct iphdr *iph, int len) +{ + struct icmphdr *icmph = (struct icmphdr *)IP_NEXT_PTR(iph); + int i; + int to_me = FALSE; + + /* process the icmp messages destined to me */ + for (i = 0; i < 
CONFIG.eths_num; i++) { + if (iph->daddr == CONFIG.eths[i].ip_addr) { + to_me = TRUE; + } + } + + if (!to_me) + return TRUE; + + switch (icmph->icmp_type) { + case ICMP_ECHO: + ProcessICMPECHORequest(mtcp, iph, len); + break; + + case ICMP_DEST_UNREACH: + TRACE_INFO("[INFO] ICMP Destination Unreachable message received\n"); + break; + + case ICMP_TIME_EXCEEDED: + TRACE_INFO("[INFO] ICMP Time Exceeded message received\n"); + break; + + default: + TRACE_INFO("[INFO] Unsupported ICMP message type %x received\n", icmph->icmp_type); + break; + } + + return TRUE; +} +/*----------------------------------------------------------------------------*/ +void DumpICMPPacket(mtcp_manager_t mtcp, struct icmphdr *icmph, uint32_t saddr, uint32_t daddr) +{ + uint8_t *t; + + thread_printf(mtcp, mtcp->log_fp, "ICMP header: \n"); + thread_printf(mtcp, mtcp->log_fp, + "Type: %d, " + "Code: %d, ID: %d, Sequence: %d\n", + icmph->icmp_type, icmph->icmp_code, ntohs(ICMP_ECHO_GET_ID(icmph)), ntohs(ICMP_ECHO_GET_SEQ(icmph))); + + t = (uint8_t *)&saddr; + thread_printf(mtcp, mtcp->log_fp, "Sender IP: %u.%u.%u.%u\n", t[0], t[1], t[2], t[3]); + + t = (uint8_t *)&daddr; + thread_printf(mtcp, mtcp->log_fp, "Target IP: %u.%u.%u.%u\n", t[0], t[1], t[2], t[3]); +} +/*----------------------------------------------------------------------------*/ +#undef IP_NEXT_PTR diff --git a/lib/flash/mtcp/include/addr_pool.h b/lib/flash/mtcp/include/addr_pool.h new file mode 100644 index 0000000..7e20115 --- /dev/null +++ b/lib/flash/mtcp/include/addr_pool.h @@ -0,0 +1,65 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ADDR_POOL_H +#define ADDR_POOL_H + +#include +#include + +#define MIN_PORT (1025) +#define MAX_PORT (65535 + 1) +/*----------------------------------------------------------------------------*/ +typedef struct addr_pool *addr_pool_t; +/*----------------------------------------------------------------------------*/ +/* CreateAddressPool() */ +/* Create address pool for given address range. */ +/* addr_base: the base address in network order. 
*/ +/* num_addr: number of addresses to use as source IP */ +/*----------------------------------------------------------------------------*/ +addr_pool_t CreateAddressPool(in_addr_t addr_base, int num_addr); +/*----------------------------------------------------------------------------*/ +/* CreateAddressPoolPerCore() */ +/* Create address pool only for the given core number. */ +/* All addresses and port numbers should be in network order. */ +/*----------------------------------------------------------------------------*/ +addr_pool_t CreateAddressPoolPerCore(int core, int num_queues, in_addr_t saddr_base, int num_addr, in_addr_t daddr, in_port_t dport); +/*----------------------------------------------------------------------------*/ +void DestroyAddressPool(addr_pool_t ap); +/*----------------------------------------------------------------------------*/ +int FetchAddress(addr_pool_t ap, int core, int num_queues, const struct sockaddr_in *daddr, struct sockaddr_in *saddr); +/*----------------------------------------------------------------------------*/ +int FetchAddressPerCore(addr_pool_t ap, int core, int num_queues, const struct sockaddr_in *daddr, struct sockaddr_in *saddr); +/*----------------------------------------------------------------------------*/ +int FreeAddress(addr_pool_t ap, const struct sockaddr_in *addr); +/*----------------------------------------------------------------------------*/ + +#endif /* ADDR_POOL_H */ diff --git a/lib/flash/mtcp/include/arp.h b/lib/flash/mtcp/include/arp.h new file mode 100644 index 0000000..bc26c4f --- /dev/null +++ b/lib/flash/mtcp/include/arp.h @@ -0,0 +1,51 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef ARP_H +#define ARP_H + +#define MAX_ARPENTRY 1024 + +int InitARPTable(void); + +unsigned char *GetHWaddr(uint32_t ip); + +unsigned char *GetDestinationHWaddr(uint32_t dip, uint8_t is_gateway); + +void RequestARP(mtcp_manager_t mtcp, uint32_t ip, int nif, uint32_t cur_ts); + +int ProcessARPPacket(mtcp_manager_t mtcp, uint32_t cur_ts, const int ifidx, unsigned char *pkt_data, int len); + +void ARPTimer(mtcp_manager_t mtcp, uint32_t cur_ts); + +void PrintARPTable(void); + +#endif /* ARP_H */ diff --git a/lib/flash/mtcp/include/ccp.h b/lib/flash/mtcp/include/ccp.h new file mode 100644 index 0000000..bf2daa8 --- /dev/null +++ b/lib/flash/mtcp/include/ccp.h @@ -0,0 +1,68 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CCP_H_ +#define __CCP_H_ + +#include + +#include "tcp_stream.h" +#include "tcp_in.h" +#include "debug.h" + +// CCP currently only supports a single global datapath and CCP instance, but +// this ID exists in case there is a need for supporting multiple +// If this change is made in the future CCP_UNIX_BASE_ID will need to be +// generated dynamically based on the CCP/datapath ID. For now, we always use 0. +#define CCP_UNIX_BASE "/tmp/ccp/" +#define CCP_ID "0/" +#define FROM_CCP "out" +#define TO_CCP "in" +#define FROM_CCP_PATH CCP_UNIX_BASE CCP_ID FROM_CCP +#define TO_CCP_PATH CCP_UNIX_BASE CCP_ID TO_CCP +#define CCP_MAX_MSG_SIZE 32678 + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +#define EVENT_DUPACK 1 +#define EVENT_TRI_DUPACK 2 +#define EVENT_TIMEOUT 3 +#define EVENT_ECN 4 + +void setup_ccp_connection(mtcp_manager_t mtcp); +void setup_ccp_send_socket(mtcp_manager_t mtcp); +void destroy_ccp_connection(mtcp_manager_t mtcp); +void ccp_create(mtcp_manager_t mtcp, tcp_stream *stream); +void ccp_cong_control(mtcp_manager_t mtcp, tcp_stream *stream, uint32_t ack, uint64_t bytes_delivered, uint64_t packets_delivered); +void ccp_record_event(mtcp_manager_t mtcp, tcp_stream *stream, uint8_t event_type, uint32_t val); + +#endif diff --git a/lib/flash/mtcp/include/clock.h b/lib/flash/mtcp/include/clock.h new file mode 100644 index 0000000..573895c --- /dev/null +++ b/lib/flash/mtcp/include/clock.h @@ -0,0 +1,44 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CLOCK__H_ +#define __CLOCK__H_ + +#include +#include +#include "tcp_stream.h" + +uint64_t now_usecs(void); +uint64_t time_since_usecs(void); +uint64_t time_after_usecs(void); +void log_cwnd_rtt(void *stream); + +#endif diff --git a/lib/flash/mtcp/include/config.h b/lib/flash/mtcp/include/config.h new file mode 100644 index 0000000..d1aa938 --- /dev/null +++ b/lib/flash/mtcp/include/config.h @@ -0,0 +1,69 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CONFIG_H +#define CONFIG_H + +#include "ps.h" + +extern int num_cpus; +extern int num_queues; +extern int num_devices; + +extern int num_devices_attached; +extern int devices_attached[MAX_DEVICES]; + +int LoadConfiguration(const char *fname); + +/* set configurations from the setted + interface information */ +int SetInterfaceInfo(void); + +/* set configurations from the files */ +int SetRoutingTable(void); + +int LoadARPTable(void); + +/* print setted configuration */ +void PrintConfiguration(void); + +void PrintInterfaceInfo(void); + +void PrintRoutingTable(void); + +/* fetch mask from prefix */ +uint32_t MaskFromPrefix(int prefix); + +void ParseMACAddress(unsigned char *haddr, char *haddr_str); + +int ParseIPAddress(uint32_t *ip_addr, char *ip_str); + +#endif /* CONFIG_H */ diff --git a/lib/flash/mtcp/include/cpu.h b/lib/flash/mtcp/include/cpu.h new file mode 100644 index 0000000..8ba6fae --- /dev/null +++ b/lib/flash/mtcp/include/cpu.h @@ -0,0 +1,39 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CPU_H +#define CPU_H + +inline int GetNumCPUs(void); + +inline int whichCoreID(int thread_no); + +#endif /* CPU_H */ diff --git a/lib/flash/mtcp/include/debug.h b/lib/flash/mtcp/include/debug.h new file mode 100644 index 0000000..eece78a --- /dev/null +++ b/lib/flash/mtcp/include/debug.h @@ -0,0 +1,272 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DEBUG_H +#define DEBUG_H + +#include +#include +#include +#include "mtcp.h" +#include "tcp_in.h" + +#ifdef DBGTEMP + +#define TRACE_TEMP(f, m...) \ + { \ + fprintf(stderr, "[CPU %d][%10s:%4d] " f, mtcp->ctx->cpu, __FUNCTION__, __LINE__, ##m); \ + } + +#else + +#define TRACE_TEMP(f, m...) 
(void)0 + +#endif /* DBGTEMP*/ + +#ifdef DBGERR + +#define TRACE_ERROR(f, m...) \ + { \ + fprintf(stderr, "[%10s:%4d] " f, __FUNCTION__, __LINE__, ##m); \ + } + +#else + +#define TRACE_ERROR(f, m...) (void)0 + +#endif /* DBGERR */ + +#ifdef DBGCERR + +#define CTRACE_ERROR(f, m...) \ + { \ + fprintf(stderr, "[CPU %d][%10s:%4d] " f, mtcp->ctx->cpu, __FUNCTION__, __LINE__, ##m); \ + } + +#else + +#define CTRACE_ERROR(f, m...) (void)0 + +#endif /* DBGERR */ + +#ifdef DBGMSG + +#define TRACE_DBG(f, m...) \ + { \ + thread_printf(mtcp, mtcp->log_fp, "[%10s:%4d] " f, __FUNCTION__, __LINE__, ##m); \ + } + +#else + +#define TRACE_DBG(f, m...) (void)0 + +#endif /* DBGMSG */ + +#ifdef INFO + +#define TRACE_INFO(f, m...) \ + { \ + fprintf(stderr, "[%10s:%4d] " f, __FUNCTION__, __LINE__, ##m); \ + } + +#else + +#define TRACE_INFO(f, m...) (void)0 + +#endif /* INFO */ + +#define TRACE_CONFIG(f, m...) fprintf(stderr, f, ##m) + +#ifdef DBGLOG +#define TRACE_LOG(f, m...) TRACE_INFO(f, ##m) +#else +#define TRACE_LOG(f, m...) (void)0 +#endif + +#ifdef STREAM +#define TRACE_STREAM(f, m...) TRACE_FUNC("STREAM", f, ##m) +#else +#define TRACE_STREAM(f, m...) (void)0 +#endif + +#ifdef STATE +#define TRACE_STATE(f, m...) TRACE_FUNC("STATE", f, ##m) +#else +#define TRACE_STATE(f, m...) (void)0 +#endif + +#ifdef SNDBUF +#define TRACE_SNDBUF(f, m...) TRACE_FUNC("SNDBUF", f, ##m) +#else +#define TRACE_SNDBUF(f, m...) (void)0 +#endif + +#ifdef RCVBUF +#define TRACE_RCVBUF(f, m...) TRACE_FUNC("RCVBUF", f, ##m) +#else +#define TRACE_RCVBUF(f, m...) (void)0 +#endif + +#ifdef CLWND +#define TRACE_CLWND(f, m...) TRACE_FUNC("CLWND", f, ##m) +#else +#define TRACE_CLWND(f, m...) (void)0 +#endif + +#ifdef LOSS +#define TRACE_LOSS(f, m...) TRACE_FUNC("LOSS", f, ##m) +#else +#define TRACE_LOSS(f, m...) (void)0 +#endif + +#ifdef SACK +#define TRACE_SACK(f, m...) TRACE_FUNC("SACK", f, ##m) +#else +#define TRACE_SACK(f, m...) (void)0 +#endif + +#ifdef TSTAMP +#define TRACE_TSTAMP(f, m...) 
TRACE_FUNC("TSTAMP", f, ##m) +#else +#define TRACE_TSTAMP(f, m...) (void)0 +#endif + +#ifdef RTT +#define TRACE_RTT(f, m...) TRACE_FUNC("RTT", f, ##m) +#else +#define TRACE_RTT(f, m...) (void)0 +#endif + +#ifdef RTO +#define TRACE_RTO(f, m...) TRACE_FUNC("RTO", f, ##m) +#else +#define TRACE_RTO(f, m...) (void)0 +#endif + +#ifdef CONG +#define TRACE_CONG(f, m...) TRACE_FUNC("CONG", f, ##m) +#else +#define TRACE_CONG(f, m...) (void)0 +#endif + +#ifdef EPOLL +#define TRACE_EPOLL(f, m...) TRACE_FUNC("EPOLL", f, ##m) +#else +#define TRACE_EPOLL(f, m...) (void)0 +#endif + +#ifdef FSTAT +#define TRACE_FSTAT(f, m...) TRACE_FUNC("FSTAT", f, ##m) +#else +#define TRACE_FSTAT(f, m...) (void)0 +#endif + +#ifdef APP +#define TRACE_APP(f, m...) TRACE_FUNC("APP", f, ##m) +#else +#define TRACE_APP(f, m...) (void)0 +#endif + +#ifdef DBGFIN +#define TRACE_FIN(f, m...) TRACE_FUNC("FIN", f, ##m) +#else +#define TRACE_FIN(f, m...) (void)0 +#endif + +#ifdef TSTAT +#define TRACE_TSTAT(f, m...) TRACE_FUNC("TSTAT", f, ##m) +#else +#define TRACE_TSTAT(f, m...) (void)0 +#endif + +#ifdef LOOP +#define TRACE_LOOP(f, m...) TRACE_FUNC("LOOP", "ts: %u, " f, cur_ts, ##m) +#else +#define TRACE_LOOP(f, m...) (void)0 +#endif + +#ifdef ROUND +#define TRACE_ROUND(f, m...) TRACE_FUNC("ROUND", f, ##m) +#else +#define TRACE_ROUND(f, m...) (void)0 +#endif + +#ifdef SELECT +#define TRACE_SELECT(f, m...) TRACE_FUNC("SELECT", f, ##m) +#else +#define TRACE_SELECT(f, m...) (void)0 +#endif + +#ifdef API +#define TRACE_API(f, m...) TRACE_FUNC("API", f, ##m) +#else +#define TRACE_API(f, m...) (void)0 +#endif + +#ifdef DBGCCP +#define TRACE_CCP(f, m...) TRACE_FUNC("CCP", f, ##m) +#else +#define TRACE_CCP(f, m...) (void)0 +#endif + +#ifdef PROBECCP +#define CCP_PROBE(f, m...) \ + { \ + fprintf(stderr, f, ##m); \ + } +#else +#define CCP_PROBE(f, m...) (void)0 +#endif + +#ifdef DBGFUNC + +#define TRACE_FUNC(n, f, m...) 
\ + { \ + thread_printf(mtcp, mtcp->log_fp, "[%6s: %10s:%4d] " f, n, __FUNCTION__, __LINE__, ##m); \ + } + +#else + +#define TRACE_FUNC(f, m...) (void)0 + +#endif /* DBGFUNC */ + +void DumpPacket(mtcp_manager_t mtcp, char *buf, int len, char *step, int ifindex); + +void DumpIPPacket(mtcp_manager_t mtcp, const struct iphdr *iph, int len); + +void DumpIPPacketToFile(FILE *fout, const struct iphdr *iph, int len); + +void flush_log_data(mtcp_manager_t mtcp); + +void thread_printf(mtcp_manager_t mtcp, FILE *f_idx, const char *_Format, ...); + +#endif /* DEBUG_H */ diff --git a/lib/flash/mtcp/include/eth_in.h b/lib/flash/mtcp/include/eth_in.h new file mode 100644 index 0000000..8b1c05d --- /dev/null +++ b/lib/flash/mtcp/include/eth_in.h @@ -0,0 +1,39 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ETH_IN_H +#define ETH_IN_H + +#include "mtcp.h" + +int ProcessPacket(mtcp_manager_t mtcp, const int ifidx, uint32_t cur_ts, unsigned char *pkt_data, int len); + +#endif /* ETH_IN_H */ diff --git a/lib/flash/mtcp/include/eth_out.h b/lib/flash/mtcp/include/eth_out.h new file mode 100644 index 0000000..5529960 --- /dev/null +++ b/lib/flash/mtcp/include/eth_out.h @@ -0,0 +1,45 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ETH_OUT_H +#define ETH_OUT_H + +#include + +#include "mtcp.h" +#include "tcp_stream.h" +#include "ps.h" + +#define MAX_SEND_PCK_CHUNK 64 + +uint8_t *EthernetOutput(struct mtcp_manager *mtcp, uint16_t h_proto, int nif, unsigned char *dst_haddr, uint16_t iplen); + +#endif /* ETH_OUT_H */ diff --git a/lib/flash/mtcp/include/eventpoll.h b/lib/flash/mtcp/include/eventpoll.h new file mode 100644 index 0000000..239cdc9 --- /dev/null +++ b/lib/flash/mtcp/include/eventpoll.h @@ -0,0 +1,81 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef EVENTPOLL_H +#define EVENTPOLL_H + +#include "mtcp_api.h" +#include "mtcp_epoll.h" + +/*----------------------------------------------------------------------------*/ +struct mtcp_epoll_stat { + uint64_t calls; + uint64_t waits; + uint64_t wakes; + + uint64_t issued; + uint64_t registered; + uint64_t invalidated; + uint64_t handled; +}; +/*----------------------------------------------------------------------------*/ +struct mtcp_epoll_event_int { + struct mtcp_epoll_event ev; + int sockid; +}; +/*----------------------------------------------------------------------------*/ +enum event_queue_type { USR_EVENT_QUEUE = 0, USR_SHADOW_EVENT_QUEUE = 1, MTCP_EVENT_QUEUE = 2 }; +/*----------------------------------------------------------------------------*/ +struct event_queue { + struct mtcp_epoll_event_int *events; + int start; // starting index + int end; // ending index + + int size; // max size + int num_events; // number of events +}; +/*----------------------------------------------------------------------------*/ +struct mtcp_epoll { + struct event_queue *usr_queue; + struct event_queue *usr_shadow_queue; + struct event_queue *mtcp_queue; + + uint8_t waiting; + struct mtcp_epoll_stat stat; + + pthread_cond_t epoll_cond; + pthread_mutex_t epoll_lock; +}; +/*----------------------------------------------------------------------------*/ + +int CloseEpollSocket(mctx_t mctx, int epid); + +#endif /* EVENTPOLL_H */ diff --git a/lib/flash/mtcp/include/fhash.h b/lib/flash/mtcp/include/fhash.h new file mode 100644 index 0000000..c1e07f7 --- /dev/null +++ b/lib/flash/mtcp/include/fhash.h @@ -0,0 +1,79 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FHASH_H +#define FHASH_H + +#include +#include "tcp_stream.h" + +#define NUM_BINS_FLOWS (131072) /* 132 K entries per thread*/ +#define NUM_BINS_LISTENERS (1024) /* assuming that chaining won't happen excessively */ +#define TCP_AR_CNT (3) + +typedef struct hash_bucket_head { + tcp_stream *tqh_first; + tcp_stream **tqh_last; +} hash_bucket_head; + +typedef struct list_bucket_head { + struct tcp_listener *tqh_first; + struct tcp_listener **tqh_last; +} list_bucket_head; + +/* hashtable structure */ +struct hashtable { + uint32_t bins; + + union { + hash_bucket_head *ht_table; + list_bucket_head *lt_table; + }; + + // functions + unsigned int (*hashfn)(const void *); + int (*eqfn)(const void *, const void *); +}; + +/*functions for hashtable*/ +struct hashtable *CreateHashtable(unsigned int (*hashfn)(const void *), int (*eqfn)(const void *, const void *), int bins); +void DestroyHashtable(struct hashtable *ht); + +int StreamHTInsert(struct hashtable *ht, void *); +void *StreamHTRemove(struct hashtable *ht, void *); +void *StreamHTSearch(struct hashtable *ht, const void *); +unsigned int HashListener(const void *hbo_port_ptr); +int EqualListener(const void *hbo_port_ptr1, const void *hbo_port_ptr2); +int ListenerHTInsert(struct hashtable *ht, void *); +void *ListenerHTRemove(struct hashtable *ht, void *); +void *ListenerHTSearch(struct hashtable *ht, const void *); + +#endif /* FHASH_H */ diff --git a/lib/flash/mtcp/include/icmp.h b/lib/flash/mtcp/include/icmp.h new file mode 100644 index 0000000..b0ff02a --- /dev/null +++ b/lib/flash/mtcp/include/icmp.h @@ -0,0 +1,79 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef ICMP_H +#define ICMP_H +/*----------------------------------------------------------------------------*/ +struct icmphdr { + uint8_t icmp_type; + uint8_t icmp_code; + uint16_t icmp_checksum; + union { + struct { + uint16_t icmp_id; + uint16_t icmp_sequence; + } echo; // ECHO | ECHOREPLY + struct { + uint16_t unused; + uint16_t nhop_mtu; + } dest; // DEST_UNREACH + } un; +}; +/*----------------------------------------------------------------------------*/ +/* getters and setters for ICMP fields */ +#define ICMP_ECHO_GET_ID(icmph) (icmph->un.echo.icmp_id) +#define ICMP_ECHO_GET_SEQ(icmph) (icmph->un.echo.icmp_sequence) +#define ICMP_DEST_UNREACH_GET_MTU(icmph) (icmph->un.dest.nhop_mtu) + +#define ICMP_ECHO_SET_ID(icmph, id) (icmph->un.echo.icmp_id = id) +#define ICMP_ECHO_SET_SEQ(icmph, seq) (icmph->un.echo.icmp_sequence = seq) + +void RequestICMP(mtcp_manager_t mtcp, uint32_t saddr, uint32_t daddr, uint16_t icmp_id, uint16_t icmp_seq, uint8_t *icmpd, + uint16_t len); + +int ProcessICMPPacket(mtcp_manager_t mtcp, struct iphdr *iph, int len); + +/* ICMP types */ +#define ICMP_ECHOREPLY 0 /* Echo Reply */ +#define ICMP_DEST_UNREACH 3 /* Destination Unreachable */ +#define ICMP_SOURCE_QUENCH 4 /* Source Quench */ +#define ICMP_REDIRECT 5 /* Redirect (change route) */ +#define ICMP_ECHO 8 /* Echo Request */ +#define ICMP_TIME_EXCEEDED 11 /* Time Exceeded */ +#define ICMP_PARAMETERPROB 12 /* Parameter Problem */ +#define ICMP_TIMESTAMP 13 /* Timestamp Request */ +#define ICMP_TIMESTAMPREPLY 14 /* Timestamp Reply */ +#define ICMP_INFO_REQUEST 15 /* Information Request */ +#define ICMP_INFO_REPLY 16 /* Information Reply */ +#define ICMP_ADDRESS 17 /* Address Mask Request */ +#define ICMP_ADDRESSREPLY 18 /* Address Mask Reply */ +/*----------------------------------------------------------------------------*/ +#endif /* ICMP_H */ diff --git a/lib/flash/mtcp/include/io_module.h b/lib/flash/mtcp/include/io_module.h new file mode 100644 index 0000000..216e26b 
--- /dev/null +++ b/lib/flash/mtcp/include/io_module.h @@ -0,0 +1,160 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef IO_MODULE_H +#define IO_MODULE_H +/*----------------------------------------------------------------------------*/ +/* for type def'ns */ +#include +/* for ps lib funcs */ +#include "ps.h" +/*----------------------------------------------------------------------------*/ +/** + * Declaration to soothe down the warnings + */ +struct mtcp_thread_context; +/** + * io_module_funcs - contains template for the various 10Gbps pkt I/O + * - libraries that can be adopted. + * + * load_module() : Used to set system-wide I/O module + * initialization. + * + * init_handle() : Used to initialize the driver library + * : Also use the context to create/initialize + * : a private packet I/O data structures. + * + * link_devices() : Used to add link(s) to the mtcp stack. + * Returns 0 on success; -1 on failure. + * + * release_pkt() : release the packet if mTCP does not need + * to process it (e.g. non-IPv4, non-TCP pkts). + * + * get_wptr() : retrieve the next empty pkt buffer for the + * application for packet writing. Returns + * ptr to pkt buffer. + * + * send_pkts() : transmit batch of packets via interface + * idx (=nif). + * Returns 0 on success; -1 on failure + * + * get_rptr() : retrieve next pkt for application for + * packet read. + * Returns ptr to pkt buffer. + * + * recv_pkts() : recieve batch of packets from the interface, + * ifidx. + * Returns no. of packets that are read from + * the iface. + * + * select() : for blocking I/O + * + * destroy_handle() : free up resources allocated during + * init_handle(). Normally called during + * process termination. 
+ * + * dev_ioctl() : contains submodules for select drivers + * + */ +typedef struct io_module_func { + void (*load_module)(void); + void (*init_handle)(struct mtcp_thread_context *ctx); + int32_t (*link_devices)(struct mtcp_thread_context *ctx); + void (*release_pkt)(struct mtcp_thread_context *ctx, int ifidx, unsigned char *pkt_data, int len); + uint8_t *(*get_wptr)(struct mtcp_thread_context *ctx, int ifidx, uint16_t len); + int32_t (*send_pkts)(struct mtcp_thread_context *ctx, int nif); + uint8_t *(*get_rptr)(struct mtcp_thread_context *ctx, int ifidx, int index, uint16_t *len); + int32_t (*recv_pkts)(struct mtcp_thread_context *ctx, int ifidx); + int32_t (*select)(struct mtcp_thread_context *ctx); + void (*destroy_handle)(struct mtcp_thread_context *ctx); + int32_t (*dev_ioctl)(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp); +#ifndef DISABLE_AFXDP + void (*drop_pkts)(struct mtcp_thread_context *ctxt); +#endif +} io_module_func __attribute__((aligned(__WORDSIZE))); +/*----------------------------------------------------------------------------*/ +/* set I/O module context */ +int SetNetEnv(char *port_list, char *port_stat_list); + +/* retrive device-specific endian type */ +int FetchEndianType(void); +/*----------------------------------------------------------------------------*/ +/* ptr to the `running' I/O module context */ +extern io_module_func *current_iomodule_func; + +/* dev_ioctl related macros */ +#define PKT_TX_IP_CSUM 0x01 +#define PKT_TX_TCP_CSUM 0x02 +#define PKT_RX_TCP_LROSEG 0x03 +#define PKT_TX_TCPIP_CSUM 0x04 +#define PKT_RX_IP_CSUM 0x05 +#define PKT_RX_TCP_CSUM 0x06 +#define PKT_TX_TCPIP_CSUM_PEEK 0x07 +#define DRV_NAME 0x08 + +/* registered psio context */ +#ifdef DISABLE_PSIO +#define ps_list_devices(x) 0 +#endif +extern io_module_func ps_module_func; +extern struct ps_device devices[MAX_DEVICES]; + +/* registered dpdk context */ +extern io_module_func dpdk_module_func; + +/* registered netmap context */ +extern 
io_module_func netmap_module_func; + +/* registered onvm context */ +extern io_module_func onvm_module_func; + +/* registered afxdp context */ +extern io_module_func afxdp_module_func; + +/* check I/O module access permissions */ +int CheckIOModuleAccessPermissions(void); + +/* Macro to assign IO module */ +#define AssignIOModule(m) \ + { \ + if (!strcmp(m, "psio")) \ + current_iomodule_func = &ps_module_func; \ + else if (!strcmp(m, "dpdk")) \ + current_iomodule_func = &dpdk_module_func; \ + else if (!strcmp(m, "onvm")) \ + current_iomodule_func = &onvm_module_func; \ + else if (!strcmp(m, "afxdp")) \ + current_iomodule_func = &afxdp_module_func; \ + else \ + assert(0); \ + } +/*----------------------------------------------------------------------------*/ +#endif /* IO_MODULE_H */ diff --git a/lib/flash/mtcp/include/ip_in.h b/lib/flash/mtcp/include/ip_in.h new file mode 100644 index 0000000..9c1d5b7 --- /dev/null +++ b/lib/flash/mtcp/include/ip_in.h @@ -0,0 +1,39 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef IP_IN_H +#define IP_IN_H + +#include "mtcp.h" + +int ProcessIPv4Packet(mtcp_manager_t mtcp, uint32_t cur_ts, const int ifidx, unsigned char *pkt_data, int len); + +#endif /* IP_IN_H */ diff --git a/lib/flash/mtcp/include/ip_out.h b/lib/flash/mtcp/include/ip_out.h new file mode 100644 index 0000000..1035489 --- /dev/null +++ b/lib/flash/mtcp/include/ip_out.h @@ -0,0 +1,47 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef IP_OUT_H +#define IP_OUT_H + +#include +#include "tcp_stream.h" + +extern inline int GetOutputInterface(uint32_t daddr, uint8_t *is_external); + +void ForwardIPv4Packet(mtcp_manager_t mtcp, int nif_in, char *buf, int len); + +uint8_t *IPOutputStandalone(struct mtcp_manager *mtcp, uint8_t protocol, uint16_t ip_id, uint32_t saddr, uint32_t daddr, + uint16_t tcplen); + +uint8_t *IPOutput(struct mtcp_manager *mtcp, tcp_stream *stream, uint16_t tcplen); + +#endif /* IP_OUT_H */ diff --git a/lib/flash/mtcp/include/logger.h b/lib/flash/mtcp/include/logger.h new file mode 100644 index 0000000..8bff276 --- /dev/null +++ b/lib/flash/mtcp/include/logger.h @@ -0,0 +1,74 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LOGGER_H +#define LOGGER_H + +#include + +#define LOG_BUFF_SIZE (256 * 1024) +#define NUM_LOG_BUFF (100) + +extern enum { IDLE_LOGT, ACTIVE_LOGT } log_thread_state; + +typedef struct log_buff { + int tid; + FILE *fid; + int buff_len; + char buff[LOG_BUFF_SIZE]; + TAILQ_ENTRY(log_buff) buff_link; +} log_buff; + +typedef struct log_thread_context { + pthread_t thread; + int cpu; + int done; + int sp_fd; + int pair_sp_fd; + int free_buff_cnt; + int job_buff_cnt; + + uint8_t state; + + pthread_mutex_t mutex; + pthread_mutex_t free_mutex; + + TAILQ_HEAD(, log_buff) working_queue; + TAILQ_HEAD(, log_buff) free_queue; + +} log_thread_context; + +log_buff *DequeueFreeBuffer(log_thread_context *ctx); +void EnqueueJobBuffer(log_thread_context *ctx, log_buff *working_bp); +void InitLogThreadContext(log_thread_context *ctx, int cpu); +void *ThreadLogMain(void *arg); + +#endif /* LOGGER_H */ diff --git a/lib/flash/mtcp/include/memory_mgt.h b/lib/flash/mtcp/include/memory_mgt.h new file mode 100644 index 0000000..615b9a8 --- /dev/null +++ b/lib/flash/mtcp/include/memory_mgt.h @@ -0,0 +1,68 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MEMORY_MGT_H +#define MEMORY_MGT_H +/*----------------------------------------------------------------------------*/ +#if !defined(DISABLE_DPDK) && !defined(ENABLE_ONVM) +#include +#include +/* for rte_versions retrieval */ +#include +/*----------------------------------------------------------------------------*/ +typedef struct rte_mempool mem_pool; +typedef struct rte_mempool *mem_pool_t; +/* create a memory pool with a chunk size and total size + an return the pointer to the memory pool */ +mem_pool_t MPCreate(char *name, int chunk_size, size_t total_size); +/*----------------------------------------------------------------------------*/ +#else +struct mem_pool; +typedef struct mem_pool *mem_pool_t; + +/* create a memory pool with a chunk size and total size + an return the pointer to the memory pool */ +mem_pool_t MPCreate(int chunk_size, size_t total_size); +#endif /* DISABLE_DPDK */ +/*----------------------------------------------------------------------------*/ +/* allocate one chunk */ +void 
*MPAllocateChunk(mem_pool_t mp); + +/* free one chunk */ +void MPFreeChunk(mem_pool_t mp, void *p); + +/* destroy the memory pool */ +void MPDestroy(mem_pool_t mp); + +/* retrun the number of free chunks */ +int MPGetFreeChunks(mem_pool_t mp); +/*----------------------------------------------------------------------------*/ +#endif /* MEMORY_MGT_H */ diff --git a/lib/flash/mtcp/include/mtcp.h b/lib/flash/mtcp/include/mtcp.h new file mode 100644 index 0000000..fb7bf37 --- /dev/null +++ b/lib/flash/mtcp/include/mtcp.h @@ -0,0 +1,380 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MTCP_H +#define MTCP_H + +#include +#include +#include +#include +#include +#ifndef DISABLE_DPDK +#include +#endif + +#include "memory_mgt.h" +#include "tcp_ring_buffer.h" +#include "tcp_send_buffer.h" +#include "tcp_stream_queue.h" +#include "socket.h" +#include "mtcp_api.h" +#include "eventpoll.h" +#include "addr_pool.h" +#include "ps.h" +#include "logger.h" +#include "stat.h" +#include "io_module.h" + +#ifdef ENABLE_ONVM +#include "onvm_nflib.h" +#endif + +#ifndef USE_CCP +#define USE_CCP FALSE +#endif + +#ifndef TRUE +#define TRUE (1) +#endif + +#ifndef FALSE +#define FALSE (0) +#endif + +#ifndef ERROR +#define ERROR (-1) +#endif + +#define ETHERNET_HEADER_LEN 14 // sizeof(struct ethhdr) +#define IP_HEADER_LEN 20 // sizeof(struct iphdr) +#define TCP_HEADER_LEN 20 // sizeof(struct tcphdr) +#define TOTAL_TCP_HEADER_LEN 54 // total header length + +/* configurations */ +#define BACKLOG_SIZE (10 * 1024) +#define MAX_PKT_SIZE (2 * 1024) +#define ETH_NUM MAX_DEVICES + +#define TCP_OPT_TIMESTAMP_ENABLED TRUE // enabled for rtt measure +#define TCP_OPT_SACK_ENABLED TRUE // only recv-side implemented + +/* Only use rate limiting if using CCP */ +#if USE_CCP +#undef RATE_LIMIT_ENABLED +#define RATE_LIMIT_ENABLED TRUE +#define PACING_ENABLED FALSE +// The following two logs are for debugging / experiments only, should be turned +// off for production use +// #define DBGCCP // ccp debug messages +// #define PROBECCP // print all cwnd changes, similar to tcpprobe 
output +#define CC_NAME 20 +#endif + +#ifndef LOCK_STREAM_QUEUE +#define LOCK_STREAM_QUEUE FALSE +#endif +#define USE_SPIN_LOCK TRUE +#define INTR_SLEEPING_MTCP TRUE +#define PROMISCUOUS_MODE TRUE + +/* blocking api became obsolete */ +#define BLOCKING_SUPPORT FALSE + +#ifndef MAX_CPUS +#define MAX_CPUS 16 +#endif +/*----------------------------------------------------------------------------*/ +/* Statistics */ +#ifdef NETSTAT +#define NETSTAT_PERTHREAD TRUE +#define NETSTAT_TOTAL TRUE +#endif /* NETSTAT */ +#define RTM_STAT FALSE +/*----------------------------------------------------------------------------*/ +/* Lock definitions for socket buffer */ +#if USE_SPIN_LOCK +#define SBUF_LOCK_INIT(lock, errmsg, action) \ + ; \ + if (pthread_spin_init(lock, PTHREAD_PROCESS_PRIVATE)) { \ + perror("pthread_spin_init" errmsg); \ + action; \ + } +#define SBUF_LOCK_DESTROY(lock) pthread_spin_destroy(lock) +#define SBUF_LOCK(lock) pthread_spin_lock(lock) +#define SBUF_UNLOCK(lock) pthread_spin_unlock(lock) +#else +#define SBUF_LOCK_INIT(lock, errmsg, action) \ + ; \ + if (pthread_mutex_init(lock, NULL)) { \ + perror("pthread_mutex_init" errmsg); \ + action; \ + } +#define SBUF_LOCK_DESTROY(lock) pthread_mutex_destroy(lock) +#define SBUF_LOCK(lock) pthread_mutex_lock(lock) +#define SBUF_UNLOCK(lock) pthread_mutex_unlock(lock) +#endif /* USE_SPIN_LOCK */ + +/* add macro if it is not defined in /usr/include/sys/queue.h */ +#ifndef TAILQ_FOREACH_SAFE +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); (var) && ((tvar) = TAILQ_NEXT((var), field), 1); (var) = (tvar)) +#endif +/*----------------------------------------------------------------------------*/ +struct eth_table { + char dev_name[128]; + int ifindex; + int stat_print; + unsigned char haddr[ETH_ALEN]; + uint32_t netmask; + // unsigned char dst_haddr[ETH_ALEN]; + uint32_t ip_addr; +}; +/*----------------------------------------------------------------------------*/ +struct 
route_table { + uint32_t daddr; + uint32_t mask; + uint32_t masked; + int prefix; + int nif; +}; +/*----------------------------------------------------------------------------*/ +struct arp_entry { + uint32_t ip; + int8_t prefix; + uint32_t ip_mask; + uint32_t ip_masked; + unsigned char haddr[ETH_ALEN]; +}; +/*----------------------------------------------------------------------------*/ +struct arp_table { + struct arp_entry *entry; + struct arp_entry *gateway; + int entries; +}; +/*----------------------------------------------------------------------------*/ +struct mtcp_config { + /* network interface config */ + struct eth_table *eths; + int *nif_to_eidx; // mapping physic port indexes to that of the configured port-list + int eths_num; + + /* route config */ + struct route_table *rtable; // routing table + struct route_table *gateway; + int routes; // # of entries + + /* arp config */ + struct arp_table arp; + + int num_cores; + int num_mem_ch; + int max_concurrency; +#ifndef DISABLE_DPDK + mpz_t _cpumask; +#endif + + int max_num_buffers; + int rcvbuf_size; + int sndbuf_size; + + int tcp_timewait; + int tcp_timeout; + + /* adding multi-process support */ + uint8_t multi_process; + uint8_t multi_process_is_master; + +#ifdef ENABLE_ONVM + struct onvm_nf_local_ctx *nf_local_ctx; + /* onvm specific args */ + uint16_t onvm_serv; + uint16_t onvm_inst; + uint16_t onvm_dest; +#endif +#if USE_CCP + char cc[CC_NAME]; +#endif +}; +/*----------------------------------------------------------------------------*/ +struct mtcp_context { + int cpu; +}; +/*----------------------------------------------------------------------------*/ +struct mtcp_sender { + int ifidx; + + /* TCP layer send queues */ + TAILQ_HEAD(control_head, tcp_stream) control_list; + TAILQ_HEAD(send_head, tcp_stream) send_list; + TAILQ_HEAD(ack_head, tcp_stream) ack_list; + + int control_list_cnt; + int send_list_cnt; + int ack_list_cnt; +}; 
+/*----------------------------------------------------------------------------*/ +struct mtcp_manager { + mem_pool_t flow_pool; /* memory pool for tcp_stream */ + mem_pool_t rv_pool; /* memory pool for recv variables */ + mem_pool_t sv_pool; /* memory pool for send variables */ + mem_pool_t mv_pool; /* memory pool for monitor variables */ + + //mem_pool_t socket_pool; + sb_manager_t rbm_snd; + rb_manager_t rbm_rcv; + struct hashtable *tcp_flow_table; +#if USE_CCP + struct hashtable *tcp_sid_table; +#endif + + uint32_t s_index : 24; /* stream index */ + socket_map_t smap; + TAILQ_HEAD(, socket_map) free_smap; + + addr_pool_t ap; /* address pool */ + + uint32_t g_id; /* id space in a thread */ + uint32_t flow_cnt; /* number of concurrent flows */ + + struct mtcp_thread_context *ctx; + + /* variables related to logger */ + int sp_fd; + log_thread_context *logger; + log_buff *w_buffer; + FILE *log_fp; + + /* variables related to event */ + struct mtcp_epoll *ep; + uint32_t ts_last_event; + + struct hashtable *listeners; + + stream_queue_t connectq; /* streams need to connect */ + stream_queue_t sendq; /* streams need to send data */ + stream_queue_t ackq; /* streams need to send ack */ + + stream_queue_t closeq; /* streams need to close */ + stream_queue_int *closeq_int; /* internally maintained closeq */ + stream_queue_t resetq; /* streams need to reset */ + stream_queue_int *resetq_int; /* internally maintained resetq */ + + stream_queue_t destroyq; /* streams need to be destroyed */ + + struct mtcp_sender *g_sender; + struct mtcp_sender *n_sender[ETH_NUM]; + + /* lists related to timeout */ + struct rto_hashstore *rto_store; + TAILQ_HEAD(timewait_head, tcp_stream) timewait_list; + TAILQ_HEAD(timeout_head, tcp_stream) timeout_list; + + int rto_list_cnt; + int timewait_list_cnt; + int timeout_list_cnt; + +#if BLOCKING_SUPPORT + TAILQ_HEAD(rcv_br_head, tcp_stream) rcv_br_list; + TAILQ_HEAD(snd_br_head, tcp_stream) snd_br_list; + int rcv_br_list_cnt; + int 
snd_br_list_cnt; +#endif + + uint32_t cur_ts; + + int wakeup_flag; + int is_sleeping; + + /* statistics */ + struct bcast_stat bstat; + struct timeout_stat tstat; +#ifdef NETSTAT + struct net_stat nstat; + struct net_stat p_nstat; + uint32_t p_nstat_ts; + + struct run_stat runstat; + struct run_stat p_runstat; + + struct time_stat rtstat; +#endif /* NETSTAT */ + struct io_module_func *iom; + +#if USE_CCP + int from_ccp; + int to_ccp; +#endif +}; +/*----------------------------------------------------------------------------*/ +typedef struct mtcp_manager *mtcp_manager_t; +/*----------------------------------------------------------------------------*/ +mtcp_manager_t GetMTCPManager(mctx_t mctx); +/*----------------------------------------------------------------------------*/ +struct mtcp_thread_context { + int cpu; + pthread_t thread; + uint8_t done : 1, exit : 1, interrupt : 1; + + struct mtcp_manager *mtcp_manager; + + void *io_private_context; + pthread_mutex_t smap_lock; + pthread_mutex_t flow_pool_lock; + pthread_mutex_t socket_pool_lock; + +#if LOCK_STREAM_QUEUE +#if USE_SPIN_LOCK + pthread_spinlock_t connect_lock; + pthread_spinlock_t close_lock; + pthread_spinlock_t reset_lock; + pthread_spinlock_t sendq_lock; + pthread_spinlock_t ackq_lock; + pthread_spinlock_t destroyq_lock; +#else + pthread_mutex_t connect_lock; + pthread_mutex_t close_lock; + pthread_mutex_t reset_lock; + pthread_mutex_t sendq_lock; + pthread_mutex_t ackq_lock; + pthread_mutex_t destroyq_lock; +#endif /* USE_SPIN_LOCK */ +#endif /* LOCK_STREAM_QUEUE */ +}; +/*----------------------------------------------------------------------------*/ +typedef struct mtcp_thread_context *mtcp_thread_context_t; +/*----------------------------------------------------------------------------*/ +extern struct mtcp_manager *g_mtcp[MAX_CPUS]; +extern struct mtcp_config CONFIG; +extern addr_pool_t ap[ETH_NUM]; +/*----------------------------------------------------------------------------*/ + +#endif /* 
MTCP_H */ diff --git a/lib/flash/mtcp/include/mtcp_api.h b/lib/flash/mtcp/include/mtcp_api.h new file mode 100644 index 0000000..ecfce3f --- /dev/null +++ b/lib/flash/mtcp/include/mtcp_api.h @@ -0,0 +1,144 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MTCP_API_H +#define MTCP_API_H + +#include +#include +#include + +#ifndef UNUSED +#define UNUSED(x) (void)x +#endif + +#ifndef INPORT_ANY +#define INPORT_ANY (uint16_t)0 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +enum socket_type { + MTCP_SOCK_UNUSED, + MTCP_SOCK_STREAM, + MTCP_SOCK_PROXY, + MTCP_SOCK_LISTENER, + MTCP_SOCK_EPOLL, + MTCP_SOCK_PIPE, +}; + +struct mtcp_conf { + int num_cores; + int max_concurrency; + + int max_num_buffers; + int rcvbuf_size; + int sndbuf_size; + + int tcp_timewait; + int tcp_timeout; +}; + +typedef struct mtcp_context *mctx_t; + +int mtcp_init(const char *config_file); + +void mtcp_destroy(void); + +int mtcp_getconf(struct mtcp_conf *conf); + +int mtcp_setconf(const struct mtcp_conf *conf); + +int mtcp_core_affinitize(int cpu); + +mctx_t mtcp_create_context(int cpu); + +void mtcp_destroy_context(mctx_t mctx); + +typedef void (*mtcp_sighandler_t)(int); + +mtcp_sighandler_t mtcp_register_signal(int signum, mtcp_sighandler_t handler); + +int mtcp_pipe(mctx_t mctx, int pipeid[2]); + +int mtcp_getsockopt(mctx_t mctx, int sockid, int level, int optname, void *optval, socklen_t *optlen); + +int mtcp_setsockopt(mctx_t mctx, int sockid, int level, int optname, const void *optval, socklen_t optlen); + +int mtcp_setsock_nonblock(mctx_t mctx, int sockid); + +/* mtcp_socket_ioctl: similar to ioctl, + but only FIONREAD is supported currently */ +int mtcp_socket_ioctl(mctx_t mctx, int sockid, int request, void *argp); + +int mtcp_socket(mctx_t mctx, int domain, int type, int protocol); + +int mtcp_bind(mctx_t mctx, int sockid, const struct sockaddr *addr, socklen_t addrlen); + +int mtcp_listen(mctx_t mctx, int sockid, int backlog); + +int mtcp_accept(mctx_t mctx, int sockid, struct sockaddr *addr, socklen_t *addrlen); + +int mtcp_init_rss(mctx_t mctx, in_addr_t saddr_base, int num_addr, in_addr_t daddr, in_addr_t dport); + +int mtcp_connect(mctx_t mctx, int sockid, const struct sockaddr *addr, socklen_t addrlen); + +int 
mtcp_close(mctx_t mctx, int sockid); + +/** Returns the current address to which the socket sockfd is bound + * @param [in] mctx: mtcp context + * @param [in] addr: address buffer to be filled + * @param [in] addrlen: amount of space pointed to by addr + * @return 0 on success, -1 on error + */ +int mtcp_getsockname(mctx_t mctx, int sock, struct sockaddr *addr, socklen_t *addrlen); + +int mtcp_getpeername(mctx_t mctx, int sockid, struct sockaddr *addr, socklen_t *addrlen); + +inline ssize_t mtcp_read(mctx_t mctx, int sockid, char *buf, size_t len); + +ssize_t mtcp_recv(mctx_t mctx, int sockid, char *buf, size_t len, int flags); + +/* readv should work in atomic */ +int mtcp_readv(mctx_t mctx, int sockid, const struct iovec *iov, int numIOV); + +ssize_t mtcp_write(mctx_t mctx, int sockid, const char *buf, size_t len); + +/* writev should work in atomic */ +int mtcp_writev(mctx_t mctx, int sockid, const struct iovec *iov, int numIOV); + +#ifdef __cplusplus +}; +#endif + +#endif /* MTCP_API_H */ diff --git a/lib/flash/mtcp/include/mtcp_epoll.h b/lib/flash/mtcp/include/mtcp_epoll.h new file mode 100644 index 0000000..9d97f48 --- /dev/null +++ b/lib/flash/mtcp/include/mtcp_epoll.h @@ -0,0 +1,92 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MTCP_EPOLL_H +#define MTCP_EPOLL_H + +#include "mtcp_api.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*----------------------------------------------------------------------------*/ +enum mtcp_epoll_op { + MTCP_EPOLL_CTL_ADD = 1, + MTCP_EPOLL_CTL_DEL = 2, + MTCP_EPOLL_CTL_MOD = 3, +}; +/*----------------------------------------------------------------------------*/ +enum mtcp_event_type { + MTCP_EPOLLNONE = 0x000, + MTCP_EPOLLIN = 0x001, + MTCP_EPOLLPRI = 0x002, + MTCP_EPOLLOUT = 0x004, + MTCP_EPOLLRDNORM = 0x040, + MTCP_EPOLLRDBAND = 0x080, + MTCP_EPOLLWRNORM = 0x100, + MTCP_EPOLLWRBAND = 0x200, + MTCP_EPOLLMSG = 0x400, + MTCP_EPOLLERR = 0x008, + MTCP_EPOLLHUP = 0x010, + MTCP_EPOLLRDHUP = 0x2000, + MTCP_EPOLLONESHOT = (1 << 30), + MTCP_EPOLLET = (1 << 31) +}; +/*----------------------------------------------------------------------------*/ +typedef union mtcp_epoll_data { + void *ptr; + int sockid; + uint32_t u32; + uint64_t u64; +} mtcp_epoll_data_t; 
+/*----------------------------------------------------------------------------*/ +struct mtcp_epoll_event { + uint32_t events; + mtcp_epoll_data_t data; +}; +/*----------------------------------------------------------------------------*/ +int mtcp_epoll_create(mctx_t mctx, int size); +/*----------------------------------------------------------------------------*/ +int mtcp_epoll_create1(mctx_t mctx, int flags); +/*----------------------------------------------------------------------------*/ +int mtcp_epoll_ctl(mctx_t mctx, int epid, int op, int sockid, struct mtcp_epoll_event *event); +/*----------------------------------------------------------------------------*/ +int mtcp_epoll_wait(mctx_t mctx, int epid, struct mtcp_epoll_event *events, int maxevents, int timeout); +/*----------------------------------------------------------------------------*/ +const char *EventToString(uint32_t event); +/*----------------------------------------------------------------------------*/ + +#ifdef __cplusplus +}; +#endif + +#endif /* MTCP_EPOLL_H */ diff --git a/lib/flash/mtcp/include/netmap.h b/lib/flash/mtcp/include/netmap.h new file mode 100644 index 0000000..8bcb992 --- /dev/null +++ b/lib/flash/mtcp/include/netmap.h @@ -0,0 +1,640 @@ +/* + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``S IS''AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD: head/sys/net/netmap.h 251139 2013-05-30 14:07:14Z luigi $ + * + * Definitions of constants and the structures used by the netmap + * framework, for the part visible to both kernel and userspace. + * Detailed info on netmap is available with "man netmap" or at + * + * http://info.iet.unipi.it/~luigi/netmap/ + * + * This API is also used to communicate with the VALE software switch + */ + +#ifndef _NET_NETMAP_H_ +#define _NET_NETMAP_H_ + +#define NETMAP_API 11 /* current API version */ + +#define NETMAP_MIN_API 11 /* min and max versions accepted */ +#define NETMAP_MAX_API 15 +/* + * Some fields should be cache-aligned to reduce contention. + * The alignment is architecture and OS dependent, but rather than + * digging into OS headers to find the exact value we use an estimate + * that should cover most architectures. + */ +#define NM_CACHE_ALIGN 128 + +/* + * --- Netmap data structures --- + * + * The userspace data structures used by netmap are shown below. + * They are allocated by the kernel and mmap()ed by userspace threads. + * Pointers are implemented as memory offsets or indexes, + * so that they can be easily dereferenced in kernel and userspace. 
+ + KERNEL (opaque, obviously) + + ==================================================================== + | + USERSPACE | struct netmap_ring + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +---------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+1 entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ + | rxring_ofs[1] | + (rx+1 entries) + | rxring_ofs[r] | + +---------------+ + + * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to + * a file descriptor, the mmap()ed region contains a (logically readonly) + * struct netmap_if pointing to struct netmap_ring's. + * + * There is one netmap_ring per physical NIC ring, plus one tx/rx ring + * pair attached to the host stack (this pair is unused for non-NIC ports). + * + * All physical/host stack ports share the same memory region, + * so that zero-copy can be implemented between them. + * VALE switch ports instead have separate memory regions. + * + * The netmap_ring is the userspace-visible replica of the NIC ring. + * Each slot has the index of a buffer (MTU-sized and residing in the + * mmapped region), its length and some flags. An extra 64-bit pointer + * is provided for user-supplied buffers in the tx path. + * + * In user space, the buffer address is computed as + * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE + * + * Added in NETMAP_API 11: + * + * + NIOCREGIF can request the allocation of extra spare buffers from + * the same memory pool. The desired number of buffers must be in + * nr_arg3. The ioctl may return fewer buffers, depending on memory + * availability. 
nr_arg3 will return the actual value, and, once + * mapped, nifp->ni_bufs_head will be the index of the first buffer. + * + * The buffers are linked to each other using the first uint32_t + * as the index. On close, ni_bufs_head must point to the list of + * buffers to be released. + * + * + NIOCREGIF can request space for extra rings (and buffers) + * allocated in the same memory space. The number of extra rings + * is in nr_arg1, and is advisory. This is a no-op on NICs where + * the size of the memory space is fixed. + * + * + NIOCREGIF can attach to PIPE rings sharing the same memory + * space with a parent device. The ifname indicates the parent device, + * which must already exist. Flags in nr_flags indicate if we want to + * bind the master or slave side, the index (from nr_ringid) + * is just a cookie and does not need to be sequential. + * + * + NIOCREGIF can also attach to 'monitor' rings that replicate + * the content of specific rings, also from the same memory space. + * + * Extra flags in nr_flags support the above functions. + * Application libraries may use the following naming scheme: + * netmap:foo all NIC ring pairs + * netmap:foo^ only host ring pair + * netmap:foo+ all NIC ring + host ring pairs + * netmap:foo-k the k-th NIC ring pair + * netmap:foo{k PIPE ring pair k, master side + * netmap:foo}k PIPE ring pair k, slave side + */ + +/* + * struct netmap_slot is a buffer descriptor + */ +struct netmap_slot { + uint32_t buf_idx; /* buffer index */ + uint16_t len; /* length for this slot */ + uint16_t flags; /* buf changed, etc. */ + uint64_t ptr; /* pointer for indirect buffers */ +}; + +/* + * The following flags control how the slot is used + */ + +#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ + /* + * must be set whenever buf_idx is changed (as it might be + * necessary to recompute the physical address and mapping) + * + * It is also set by the kernel whenever the buf_idx is + * changed internally (e.g., by pipes). 
Applications may + * use this information to know when they can reuse the + * contents of previously prepared buffers. + */ + +#define NS_REPORT 0x0002 /* ask the hardware to report results */ + /* + * Request notification when slot is used by the hardware. + * Normally transmit completions are handled lazily and + * may be unreported. This flag lets us know when a slot + * has been sent (e.g. to terminate the sender). + */ + +#define NS_FORWARD 0x0004 /* pass packet 'forward' */ + /* + * (Only for physical ports, rx rings with NR_FORWARD set). + * Slot released to the kernel (i.e. before ring->head) with + * this flag set are passed to the peer ring (host/NIC), + * thus restoring the host-NIC connection for these slots. + * This supports efficient traffic monitoring or firewalling. + */ + +#define NS_NO_LEARN 0x0008 /* disable bridge learning */ + /* + * On a VALE switch, do not 'learn' the source port for + * this buffer. + */ + +#define NS_INDIRECT 0x0010 /* userspace buffer */ + /* + * (VALE tx rings only) data is in a userspace buffer, + * whose address is in the 'ptr' field in the slot. + */ + +#define NS_MOREFRAG 0x0020 /* packet has more fragments */ + /* + * (VALE ports only) + * Set on all but the last slot of a multi-segment packet. + * The 'len' field refers to the individual fragment. + */ + +#define NS_PORT_SHIFT 8 +#define NS_PORT_MASK (0xff << NS_PORT_SHIFT) +/* + * The high 8 bits of the flag, if not zero, indicate the + * destination port for the VALE switch, overriding + * the lookup table. + */ + +#define NS_RFRAGS(_slot) (((_slot)->flags >> 8) & 0xff) +/* + * (VALE rx rings only) the high 8 bits + * are the number of fragments. + */ + +/* + * struct netmap_ring + * + * Netmap representation of a TX or RX ring (also known as "queue"). + * This is a queue implemented as a fixed-size circular array. + * At the software level the important fields are: head, cur, tail. + * + * In TX rings: + * + * head first slot available for transmission. 
+ * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel + * + * [head .. tail-1] can be used for new packets to send; + * 'head' and 'cur' must be incremented as slots are filled + * with new packets to be sent; + * 'cur' can be moved further ahead if we need more space + * for new transmissions. XXX todo (2014-03-12) + * + * In RX rings: + * + * head first valid received packet + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel + * + * [head .. tail-1] contain received packets; + * 'head' and 'cur' must be incremented as slots are consumed + * and can be returned to the kernel; + * 'cur' can be moved further ahead if we want to wait for + * new packets without returning the previous ones. + * + * DATA OWNERSHIP/LOCKING: + * The netmap_ring, and all slots and buffers in the range + * [head .. tail-1] are owned by the user program; + * the kernel only accesses them during a netmap system call + * and in the user thread context. + * + * Other slots and buffers are reserved for use by the kernel + */ +struct netmap_ring { + /* + * buf_ofs is meant to be used through macros. + * It contains the offset of the buffer region from this + * descriptor. + */ + const int64_t buf_ofs; + const uint32_t num_slots; /* number of slots in the ring. */ + const uint32_t nr_buf_size; + const uint16_t ringid; + const uint16_t dir; /* 0: tx, 1: rx */ + + uint32_t head; /* (u) first user slot */ + uint32_t cur; /* (u) wakeup point */ + uint32_t tail; /* (k) first kernel slot */ + + uint32_t flags; + + struct timeval ts; /* (k) time of last *sync() */ + + /* opaque room for a mutex or similar object */ +#if !defined(_WIN32) || defined(__CYGWIN__) + uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128]; +#else + uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128]; +#endif + + /* the slots follow. 
This struct has variable size */ + struct netmap_slot slot[0]; /* array of slots. */ +}; + +/* + * RING FLAGS + */ +#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ + /* + * updates the 'ts' field on each netmap syscall. This saves + * saves a separate gettimeofday(), and is not much worse than + * software timestamps generated in the interrupt handler. + */ + +#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ + /* + * Enables the NS_FORWARD slot flag for the ring. + */ + +/* + * Netmap representation of an interface and its queue(s). + * This is initialized by the kernel when binding a file + * descriptor to a port, and should be considered as readonly + * by user programs. The kernel never uses it. + * + * There is one netmap_if for each file descriptor on which we want + * to select/poll. + * select/poll operates on one or all pairs depending on the value of + * nmr_queueid passed on the ioctl. + */ +struct netmap_if { + char ni_name[IFNAMSIZ]; /* name of the interface. */ + const uint32_t ni_version; /* API version, currently unused */ + const uint32_t ni_flags; /* properties */ +#define NI_PRIV_MEM 0x1 /* private memory region */ + + /* + * The number of packet rings available in netmap mode. + * Physical NICs can have different numbers of tx and rx rings. + * Physical NICs also have a 'host' ring pair. + * Additionally, clients can request additional ring pairs to + * be used for internal communication. + */ + const uint32_t ni_tx_rings; /* number of HW tx rings */ + const uint32_t ni_rx_rings; /* number of HW rx rings */ + + uint32_t ni_bufs_head; /* head index for extra bufs */ + uint32_t ni_spare1[5]; + /* + * The following array contains the offset of each netmap ring + * from this structure, in the following order: + * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; + * NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings. 
+ * + * The area is filled up by the kernel on NIOCREGIF, + * and then only read by userspace code. + */ + const ssize_t ring_ofs[0]; +}; + +#ifndef NIOCREGIF +/* + * ioctl names and related fields + * + * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, + * whose identity is set in NIOCREGIF through nr_ringid. + * These are non blocking and take no argument. + * + * NIOCGINFO takes a struct ifreq, the interface name is the input, + * the outputs are number of queues and number of descriptor + * for each queue (useful to set number of threads etc.). + * The info returned is only advisory and may change before + * the interface is bound to a file descriptor. + * + * NIOCREGIF takes an interface name within a struct nmre, + * and activates netmap mode on the interface (if possible). + * + * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we + * can pass it down to other NIC-related ioctls. + * + * The actual argument (struct nmreq) has a number of options to request + * different functions. + * The following are used in NIOCREGIF when nr_cmd == 0: + * + * nr_name (in) + * The name of the port (em0, valeXXX:YYY, etc.) + * limited to IFNAMSIZ for backward compatibility. + * + * nr_version (in/out) + * Must match NETMAP_API as used in the kernel, error otherwise. + * Always returns the desired value on output. + * + * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out) + * On input, non-zero values may be used to reconfigure the port + * according to the requested values, but this is not guaranteed. + * On output the actual values in use are reported. + * + * nr_ringid (in) + * Indicates how rings should be bound to the file descriptors. + * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK) + * are used to indicate the ring number, and nr_flags specifies + * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected. 
+ * + * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED: + * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control + * the binding as follows: + * 0 (default) binds all physical rings + * NETMAP_HW_RING | ring number binds a single ring pair + * NETMAP_SW_RING binds only the host tx/rx rings + * + * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push + * packets on tx rings only if POLLOUT is set. + * The default is to push any pending packet. + * + * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release + * packets on rx rings also when POLLIN is NOT set. + * The default is to touch the rx ring only with POLLIN. + * Note that this is the opposite of TX because it + * reflects the common usage. + * + * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead. + * NETMAP_PRIV_MEM is set on return for ports that do not use + * the global memory allocator. + * This information is not significant and applications + * should look at the region id in nr_arg2 + * + * nr_flags is the recommended mode to indicate which rings should + * be bound to a file descriptor. Values are NR_REG_* + * + * nr_arg1 (in) The number of extra rings to be reserved. + * Especially when allocating a VALE port the system only + * allocates the amount of memory needed for the port. + * If more shared memory rings are desired (e.g. for pipes), + * the first invocation for the same basename/allocator + * should specify a suitable number. Memory cannot be + * extended after the first allocation without closing + * all ports on the same region. + * + * nr_arg2 (in/out) The identity of the memory region used. + * On input, 0 means the system decides autonomously, + * other values may try to select a specific region. + * On return the actual value is reported. + * Region '1' is the global allocator, normally shared + * by all interfaces. Other values are private regions. + * If two ports the same region zero-copy is possible. 
+ * + * nr_arg3 (in/out) number of extra buffers to be allocated. + * + * + * + * nr_cmd (in) if non-zero indicates a special command: + * NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch; nr_ringid specifies + * which rings to use. Used by vale-ctl -a ... + * nr_arg1 = NETMAP_BDG_HOST also attaches the host port + * as in vale-ctl -h ... + * + * NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC. + * Used by vale-ctl -d ... + * + * NETMAP_BDG_LIST + * list the configuration of VALE switches. + * + * NETMAP_BDG_VNET_HDR + * Set the virtio-net header length used by the client + * of a VALE switch port. + * + * NETMAP_BDG_NEWIF + * create a persistent VALE port with name nr_name. + * Used by vale-ctl -n ... + * + * NETMAP_BDG_DELIF + * delete a persistent VALE port. Used by vale-ctl -d ... + * + * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific + * + * + * + */ + +/* + * struct nmreq overlays a struct ifreq (just the name) + */ +struct nmreq { + char nr_name[IFNAMSIZ]; + uint32_t nr_version; /* API version */ + uint32_t nr_offset; /* nifp offset in the shared region */ + uint32_t nr_memsize; /* size of the shared region */ + uint32_t nr_tx_slots; /* slots in tx rings */ + uint32_t nr_rx_slots; /* slots in rx rings */ + uint16_t nr_tx_rings; /* number of tx rings */ + uint16_t nr_rx_rings; /* number of rx rings */ + + uint16_t nr_ringid; /* ring(s) we care about */ +#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */ +#define NETMAP_SW_RING 0x2000 /* only host ring pair */ + +#define NETMAP_RING_MASK 0x0fff /* the ring number */ + +#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ + +#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */ + + uint16_t nr_cmd; +#define NETMAP_BDG_ATTACH 1 /* attach the NIC */ +#define NETMAP_BDG_DETACH 2 /* detach the NIC */ +#define NETMAP_BDG_REGOPS 3 /* register bridge callbacks */ +#define NETMAP_BDG_LIST 4 /* get bridge's info 
*/ +#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */ +#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ +#define NETMAP_BDG_NEWIF 6 /* create a virtual port */ +#define NETMAP_BDG_DELIF 7 /* destroy a virtual port */ +#define NETMAP_PT_HOST_CREATE 8 /* create ptnetmap kthreads */ +#define NETMAP_PT_HOST_DELETE 9 /* delete ptnetmap kthreads */ +#define NETMAP_BDG_POLLING_ON 10 /* delete polling kthread */ +#define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */ +#define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */ + uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ +#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ + + uint16_t nr_arg2; + uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */ + uint32_t nr_flags; + /* various modes, extends nr_ringid */ + uint32_t spare2[1]; +}; + +#define NR_REG_MASK 0xf /* values for nr_flags */ +enum { + NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ + NR_REG_ALL_NIC = 1, + NR_REG_SW = 2, + NR_REG_NIC_SW = 3, + NR_REG_ONE_NIC = 4, + NR_REG_PIPE_MASTER = 5, + NR_REG_PIPE_SLAVE = 6, +}; +/* monitor uses the NR_REG to select the rings to monitor */ +#define NR_MONITOR_TX 0x100 +#define NR_MONITOR_RX 0x200 +#define NR_ZCOPY_MON 0x400 +/* request exclusive access to the selected rings */ +#define NR_EXCLUSIVE 0x800 +/* request ptnetmap host support */ +#define NR_PASSTHROUGH_HOST NR_PTNETMAP_HOST /* deprecated */ +#define NR_PTNETMAP_HOST 0x1000 +#define NR_RX_RINGS_ONLY 0x2000 +#define NR_TX_RINGS_ONLY 0x4000 +/* Applications set this flag if they are able to deal with virtio-net headers, + * that is send/receive frames that start with a virtio-net header. + * If not set, NIOCREGIF will fail with netmap ports that require applications + * to use those headers. If the flag is set, the application can use the + * NETMAP_VNET_HDR_GET command to figure out the header length. 
*/ +#define NR_ACCEPT_VNET_HDR 0x8000 + +/* + * Windows does not have _IOWR(). _IO(), _IOW() and _IOR() are defined + * in ws2def.h but not sure if they are in the form we need. + * XXX so we redefine them + * in a convenient way to use for DeviceIoControl signatures + */ +#ifdef _WIN32 +#undef _IO // ws2def.h +#define _WIN_NM_IOCTL_TYPE 40000 +#define _IO(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800), METHOD_BUFFERED, FILE_ANY_ACCESS) +#define _IO_direct(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800), METHOD_OUT_DIRECT, FILE_ANY_ACCESS) + +#define _IOWR(_c, _n, _s) _IO(_c, _n) + +/* We havesome internal sysctl in addition to the externally visible ones */ +#define NETMAP_MMAP _IO_direct('i', 160) // note METHOD_OUT_DIRECT +#define NETMAP_POLL _IO('i', 162) + +/* and also two setsockopt for sysctl emulation */ +#define NETMAP_SETSOCKOPT _IO('i', 140) +#define NETMAP_GETSOCKOPT _IO('i', 141) + +//These linknames are for the Netmap Core Driver +#define NETMAP_NT_DEVICE_NAME L"\\Device\\NETMAP" +#define NETMAP_DOS_DEVICE_NAME L"\\DosDevices\\netmap" + +//Definition of a structure used to pass a virtual address within an IOCTL +typedef struct _MEMORY_ENTRY { + PVOID pUsermodeVirtualAddress; +} MEMORY_ENTRY, *PMEMORY_ENTRY; + +typedef struct _POLL_REQUEST_DATA { + int events; + int timeout; + int revents; +} POLL_REQUEST_DATA; + +#endif /* _WIN32 */ + +/* + * FreeBSD uses the size value embedded in the _IOWR to determine + * how much to copy in/out. So we need it to match the actual + * data structure we pass. We put some spares in the structure + * to ease compatibility with other versions + */ +#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ +#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ +#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ +#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ +#define NIOCCONFIG _IOWR('i', 150, struct nm_ifreq) /* for ext. 
modules */ +#endif /* !NIOCREGIF */ + +/* + * Helper functions for kernel and userspace + */ + +/* + * check if space is available in the ring. + */ +static inline int nm_ring_empty(struct netmap_ring *ring) +{ + return (ring->cur == ring->tail); +} + +/* + * Opaque structure that is passed to an external kernel + * module via ioctl(fd, NIOCCONFIG, req) for a user-owned + * bridge port (at this point ephemeral VALE interface). + */ +#define NM_IFRDATA_LEN 256 +struct nm_ifreq { + char nifr_name[IFNAMSIZ]; + char data[NM_IFRDATA_LEN]; +}; + +/* + * netmap kernel thread configuration + */ +/* bhyve/vmm.ko MSIX parameters for IOCTL */ +struct ptn_vmm_ioctl_msix { + uint64_t msg; + uint64_t addr; +}; + +/* IOCTL parameters */ +struct nm_kth_ioctl { + u_long com; + /* TODO: use union */ + union { + struct ptn_vmm_ioctl_msix msix; + } data; +}; + +/* Configuration of a ptnetmap ring */ +struct ptnet_ring_cfg { + uint64_t ioeventfd; /* eventfd in linux, tsleep() parameter in FreeBSD */ + uint64_t irqfd; /* eventfd in linux, ioctl fd in FreeBSD */ + struct nm_kth_ioctl ioctl; /* ioctl parameter to send irq (only used in bhyve/FreeBSD) */ +}; +#endif /* _NET_NETMAP_H_ */ diff --git a/lib/flash/mtcp/include/netmap_user.h b/lib/flash/mtcp/include/netmap_user.h new file mode 100644 index 0000000..1cb0596 --- /dev/null +++ b/lib/flash/mtcp/include/netmap_user.h @@ -0,0 +1,968 @@ +/* + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * + * Functions and macros to manipulate netmap structures and packets + * in userspace. See netmap(4) for more information. + * + * The address of the struct netmap_if, say nifp, is computed from the + * value returned from ioctl(.., NIOCREG, ...) and the mmap region: + * ioctl(fd, NIOCREG, &req); + * mem = mmap(0, ... ); + * nifp = NETMAP_IF(mem, req.nr_nifp); + * (so simple, we could just do it manually) + * + * From there: + * struct netmap_ring *NETMAP_TXRING(nifp, index) + * struct netmap_ring *NETMAP_RXRING(nifp, index) + * we can access ring->cur, ring->head, ring->tail, etc. + * + * ring->slot[i] gives us the i-th slot (we can access + * directly len, flags, buf_idx) + * + * char *buf = NETMAP_BUF(ring, x) returns a pointer to + * the buffer numbered x + * + * All ring indexes (head, cur, tail) should always move forward. + * To compute the next index in a circular ring you can use + * i = nm_ring_next(ring, i); + * + * To ease porting apps from pcap to netmap we supply a few fuctions + * that can be called to open, close, read and write on netmap in a way + * similar to libpcap. 
Note that the read/write function depend on + * an ioctl()/select()/poll() being issued to refill rings or push + * packets out. + * + * In order to use these, include #define NETMAP_WITH_LIBS + * in the source file that invokes these functions. + */ + +#ifndef _NET_NETMAP_USER_H_ +#define _NET_NETMAP_USER_H_ + +#define NETMAP_DEVICE_NAME "/dev/netmap" +#ifdef __CYGWIN__ +/* + * we can compile userspace apps with either cygwin or msvc, + * and we use _WIN32 to identify windows specific code + */ +#ifndef _WIN32 +#define _WIN32 +#endif /* _WIN32 */ + +#endif /* __CYGWIN__ */ + +#ifdef _WIN32 +#undef NETMAP_DEVICE_NAME +#define NETMAP_DEVICE_NAME "/proc/sys/DosDevices/Global/netmap" +#include +#include +#include +//#include +//#include +//#define IFNAMSIZ 256 +#endif + +#include +#include /* apple needs sockaddr */ +#include /* IFNAMSIZ */ + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* likely and unlikely */ + +#include "netmap.h" + +/* helper macro */ +#define _NETMAP_OFFSET(type, ptr, offset) ((type)(void *)((char *)(ptr) + (offset))) + +#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs) + +#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, nifp, (nifp)->ring_ofs[index]) + +#define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1]) + +#define NETMAP_BUF(ring, index) ((char *)(ring) + (ring)->buf_ofs + ((index) * (ring)->nr_buf_size)) + +#define NETMAP_BUF_IDX(ring, buf) (((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs)) / (ring)->nr_buf_size) + +static inline uint32_t nm_ring_next(struct netmap_ring *r, uint32_t i) +{ + return (unlikely(i + 1 == r->num_slots) ? 0 : i + 1); +} + +/* + * Return 1 if we have pending transmissions in the tx ring. 
+ * When everything is complete ring->head = ring->tail + 1 (modulo ring size) + */ +static inline int nm_tx_pending(struct netmap_ring *r) +{ + return nm_ring_next(r, r->tail) != r->head; +} + +static inline uint32_t nm_ring_space(struct netmap_ring *ring) +{ + int ret = ring->tail - ring->cur; + if (ret < 0) + ret += ring->num_slots; + return ret; +} + +#ifdef NETMAP_WITH_LIBS +/* + * Support for simple I/O libraries. + * Include other system headers required for compiling this. + */ + +#ifndef HAVE_NETMAP_WITH_LIBS +#define HAVE_NETMAP_WITH_LIBS + +#include +#include +#include +#include /* memset */ +#include +#include /* EINVAL */ +#include /* O_RDWR */ +#include /* close() */ +#include +#include + +#ifndef ND /* debug macros */ +/* debug support */ +#define ND(_fmt, ...) \ + do { \ + } while (0) +#define D(_fmt, ...) \ + do { \ + struct timeval _t0; \ + gettimeofday(&_t0, NULL); \ + fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n", (int)(_t0.tv_sec % 1000), (int)_t0.tv_usec, __FUNCTION__, __LINE__, \ + ##__VA_ARGS__); \ + } while (0) + +/* Rate limited version of "D", lps indicates how many per second */ +#define RD(lps, format, ...) \ + do { \ + static int __t0, __cnt; \ + struct timeval __xxts; \ + gettimeofday(&__xxts, NULL); \ + if (__t0 != __xxts.tv_sec) { \ + __t0 = __xxts.tv_sec; \ + __cnt = 0; \ + } \ + if (__cnt++ < lps) { \ + D(format, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +struct nm_pkthdr { /* same as pcap_pkthdr */ + struct timeval ts; + uint32_t caplen; + uint32_t len; +}; + +struct nm_stat { /* same as pcap_stat */ + u_int ps_recv; + u_int ps_drop; + u_int ps_ifdrop; +#ifdef WIN32 + u_int bs_capt; +#endif /* WIN32 */ +}; + +#define NM_ERRBUF_SIZE 512 + +struct nm_desc { + struct nm_desc *self; /* point to self if netmap. 
*/ + int fd; + void *mem; + uint32_t memsize; + int done_mmap; /* set if mem is the result of mmap */ + struct netmap_if *const nifp; + uint16_t first_tx_ring, last_tx_ring, cur_tx_ring; + uint16_t first_rx_ring, last_rx_ring, cur_rx_ring; + struct nmreq req; /* also contains the nr_name = ifname */ + struct nm_pkthdr hdr; + + /* + * The memory contains netmap_if, rings and then buffers. + * Given a pointer (e.g. to nm_inject) we can compare with + * mem/buf_start/buf_end to tell if it is a buffer or + * some other descriptor in our region. + * We also store a pointer to some ring as it helps in the + * translation from buffer indexes to addresses. + */ + struct netmap_ring *const some_ring; + void *const buf_start; + void *const buf_end; + /* parameters from pcap_open_live */ + int snaplen; + int promisc; + int to_ms; + char *errbuf; + + /* save flags so we can restore them on close */ + uint32_t if_flags; + uint32_t if_reqcap; + uint32_t if_curcap; + + struct nm_stat st; + char msg[NM_ERRBUF_SIZE]; +}; + +/* + * when the descriptor is open correctly, d->self == d + * Eventually we should also use some magic number. + */ +#define P2NMD(p) ((struct nm_desc *)(p)) +#define IS_NETMAP_DESC(d) ((d) && P2NMD(d)->self == P2NMD(d)) +#define NETMAP_FD(d) (P2NMD(d)->fd) + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. 
+ */ +static inline void nm_pkt_copy(const void *_src, void *_dst, int l) +{ + const uint64_t *src = (const uint64_t *)_src; + uint64_t *dst = (uint64_t *)_dst; + + if (unlikely(l >= 1024)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l -= 64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + +/* + * The callback, invoked on each received packet. Same as libpcap + */ +typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d); + +/* + *--- the pcap-like API --- + * + * nm_open() opens a file descriptor, binds to a port and maps memory. + * + * ifname (netmap:foo or vale:foo) is the port name + * a suffix can indicate the follwing: + * ^ bind the host (sw) ring pair + * * bind host and NIC ring pairs (transparent) + * -NN bind individual NIC ring pair + * {NN bind master side of pipe NN + * }NN bind slave side of pipe NN + * a suffix starting with / and the following flags, + * in any order: + * x exclusive access + * z zero copy monitor + * t monitor tx side + * r monitor rx side + * R bind only RX ring(s) + * T bind only TX ring(s) + * + * req provides the initial values of nmreq before parsing ifname. + * Remember that the ifname parsing will override the ring + * number in nm_ringid, and part of nm_flags; + * flags special functions, normally 0 + * indicates which fields of *arg are significant + * arg special functions, normally NULL + * if passed a netmap_desc with mem != NULL, + * use that memory instead of mmap. + */ + +static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req, uint64_t flags, const struct nm_desc *arg); + +/* + * nm_open can import some fields from the parent descriptor. + * These flags control which ones. + * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL, + * which set the initial value for these flags. 
+ * Note that the 16 low bits of the flags are reserved for data + * that may go into the nmreq. + */ +enum { + NM_OPEN_NO_MMAP = 0x040000, /* reuse mmap from parent */ + NM_OPEN_IFNAME = 0x080000, /* nr_name, nr_ringid, nr_flags */ + NM_OPEN_ARG1 = 0x100000, + NM_OPEN_ARG2 = 0x200000, + NM_OPEN_ARG3 = 0x400000, + NM_OPEN_RING_CFG = 0x800000, /* tx|rx rings|slots */ +}; + +/* + * nm_close() closes and restores the port to its previous state + */ + +static int nm_close(struct nm_desc *); + +/* + * nm_mmap() do mmap or inherit from parent if the nr_arg2 + * (memory block) matches. + */ + +static int nm_mmap(struct nm_desc *, const struct nm_desc *); + +/* + * nm_inject() is the same as pcap_inject() + * nm_dispatch() is the same as pcap_dispatch() + * nm_nextpkt() is the same as pcap_next() + */ + +static int nm_inject(struct nm_desc *, const void *, size_t); +static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *); +static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *); + +#ifdef _WIN32 + +intptr_t _get_osfhandle(int); /* defined in io.h in windows */ + +/* + * In windows we do not have yet native poll support, so we keep track + * of file descriptors associated to netmap ports to emulate poll on + * them and fall back on regular poll on other file descriptors. 
+ */ +struct win_netmap_fd_list { + struct win_netmap_fd_list *next; + int win_netmap_fd; + HANDLE win_netmap_handle; +}; + +/* + * list head containing all the netmap opened fd and their + * windows HANDLE counterparts + */ +static struct win_netmap_fd_list *win_netmap_fd_list_head; + +static void win_insert_fd_record(int fd) +{ + struct win_netmap_fd_list *curr; + + for (curr = win_netmap_fd_list_head; curr; curr = curr->next) { + if (fd == curr->win_netmap_fd) { + return; + } + } + curr = calloc(1, sizeof(*curr)); + curr->next = win_netmap_fd_list_head; + curr->win_netmap_fd = fd; + curr->win_netmap_handle = IntToPtr(_get_osfhandle(fd)); + win_netmap_fd_list_head = curr; +} + +void win_remove_fd_record(int fd) +{ + struct win_netmap_fd_list *curr = win_netmap_fd_list_head; + struct win_netmap_fd_list *prev = NULL; + for (; curr; prev = curr, curr = curr->next) { + if (fd != curr->win_netmap_fd) + continue; + /* found the entry */ + if (prev == NULL) { /* we are freeing the first entry */ + win_netmap_fd_list_head = curr->next; + } else { + prev->next = curr->next; + } + free(curr); + break; + } +} + +HANDLE +win_get_netmap_handle(int fd) +{ + struct win_netmap_fd_list *curr; + + for (curr = win_netmap_fd_list_head; curr; curr = curr->next) { + if (fd == curr->win_netmap_fd) { + return curr->win_netmap_handle; + } + } + return NULL; +} + +/* + * we need to wrap ioctl and mmap, at least for the netmap file descriptors + */ + +/* + * use this function only from netmap_user.h internal functions + * same as ioctl, returns 0 on success and -1 on error + */ +static int win_nm_ioctl_internal(HANDLE h, int32_t ctlCode, void *arg) +{ + DWORD bReturn = 0, szIn, szOut; + BOOL ioctlReturnStatus; + void *inParam = arg, *outParam = arg; + + switch (ctlCode) { + case NETMAP_POLL: + szIn = sizeof(POLL_REQUEST_DATA); + szOut = sizeof(POLL_REQUEST_DATA); + break; + case NETMAP_MMAP: + szIn = 0; + szOut = sizeof(void *); + inParam = NULL; /* nothing on input */ + break; + case 
NIOCTXSYNC: + case NIOCRXSYNC: + szIn = 0; + szOut = 0; + break; + case NIOCREGIF: + szIn = sizeof(struct nmreq); + szOut = sizeof(struct nmreq); + break; + case NIOCCONFIG: + D("unsupported NIOCCONFIG!"); + return -1; + + default: /* a regular ioctl */ + D("invalid ioctl %x on netmap fd", ctlCode); + return -1; + } + + ioctlReturnStatus = DeviceIoControl(h, ctlCode, inParam, szIn, outParam, szOut, &bReturn, NULL); + // XXX note windows returns 0 on error or async call, 1 on success + // we could call GetLastError() to figure out what happened + return ioctlReturnStatus ? 0 : -1; +} + +/* + * this function is what must be called from user-space programs + * same as ioctl, returns 0 on success and -1 on error + */ +static int win_nm_ioctl(int fd, int32_t ctlCode, void *arg) +{ + HANDLE h = win_get_netmap_handle(fd); + + if (h == NULL) { + return ioctl(fd, ctlCode, arg); + } else { + return win_nm_ioctl_internal(h, ctlCode, arg); + } +} + +#define ioctl win_nm_ioctl /* from now on, within this file ... */ + +/* + * We cannot use the native mmap on windows + * The only parameter used is "fd", the other ones are just declared to + * make this signature comparable to the FreeBSD/Linux one + */ +static void *win32_mmap_emulated(void *addr, size_t length, int prot, int flags, int fd, int32_t offset) +{ + HANDLE h = win_get_netmap_handle(fd); + + if (h == NULL) { + return mmap(addr, length, prot, flags, fd, offset); + } else { + MEMORY_ENTRY ret; + + return win_nm_ioctl_internal(h, NETMAP_MMAP, &ret) ? 
NULL : ret.pUsermodeVirtualAddress; + } +} + +#define mmap win32_mmap_emulated + +#include /* XXX needed to use the structure pollfd */ + +static int win_nm_poll(struct pollfd *fds, int nfds, int timeout) +{ + HANDLE h; + + if (nfds != 1 || fds == NULL || (h = win_get_netmap_handle(fds->fd)) == NULL) { + ; + return poll(fds, nfds, timeout); + } else { + POLL_REQUEST_DATA prd; + + prd.timeout = timeout; + prd.events = fds->events; + + win_nm_ioctl_internal(h, NETMAP_POLL, &prd); + if ((prd.revents == POLLERR) || (prd.revents == STATUS_TIMEOUT)) { + return -1; + } + return 1; + } +} + +#define poll win_nm_poll + +static int win_nm_open(char *pathname, int flags) +{ + if (strcmp(pathname, NETMAP_DEVICE_NAME) == 0) { + int fd = open(NETMAP_DEVICE_NAME, O_RDWR); + if (fd < 0) { + return -1; + } + + win_insert_fd_record(fd); + return fd; + } else { + return open(pathname, flags); + } +} + +#define open win_nm_open + +static int win_nm_close(int fd) +{ + if (fd != -1) { + close(fd); + if (win_get_netmap_handle(fd) != NULL) { + win_remove_fd_record(fd); + } + } + return 0; +} + +#define close win_nm_close + +#endif /* _WIN32 */ + +/* + * Try to open, return descriptor if successful, NULL otherwise. + * An invalid netmap name will return errno = 0; + * You can pass a pointer to a pre-filled nm_desc to add special + * parameters. Flags is used as follows + * NM_OPEN_NO_MMAP use the memory from arg, only XXX avoid mmap + * if the nr_arg2 (memory block) matches. 
+ * NM_OPEN_ARG1 use req.nr_arg1 from arg + * NM_OPEN_ARG2 use req.nr_arg2 from arg + * NM_OPEN_RING_CFG user ring config from arg + */ +static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req, uint64_t new_flags, const struct nm_desc *arg) +{ + struct nm_desc *d = NULL; + const struct nm_desc *parent = arg; + u_int namelen; + uint32_t nr_ringid = 0, nr_flags, nr_reg; + const char *port = NULL; +#define MAXERRMSG 80 + char errmsg[MAXERRMSG] = ""; + enum { P_START, P_RNGSFXOK, P_GETNUM, P_FLAGS, P_FLAGSOK } p_state; + long num; + + if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) { + errno = 0; /* name not recognised, not an error */ + return NULL; + } + if (ifname[0] == 'n') + ifname += 7; + /* scan for a separator */ + for (port = ifname; *port && !index("-*^{}/", *port); port++) + ; + namelen = port - ifname; + if (namelen >= sizeof(d->req.nr_name)) { + snprintf(errmsg, MAXERRMSG, "name too long"); + goto fail; + } + p_state = P_START; + nr_flags = NR_REG_ALL_NIC; /* default for no suffix */ + while (*port) { + switch (p_state) { + case P_START: + switch (*port) { + case '^': /* only SW ring */ + nr_flags = NR_REG_SW; + p_state = P_RNGSFXOK; + break; + case '*': /* NIC and SW */ + nr_flags = NR_REG_NIC_SW; + p_state = P_RNGSFXOK; + break; + case '-': /* one NIC ring pair */ + nr_flags = NR_REG_ONE_NIC; + p_state = P_GETNUM; + break; + case '{': /* pipe (master endpoint) */ + nr_flags = NR_REG_PIPE_MASTER; + p_state = P_GETNUM; + break; + case '}': /* pipe (slave endoint) */ + nr_flags = NR_REG_PIPE_SLAVE; + p_state = P_GETNUM; + break; + case '/': /* start of flags */ + p_state = P_FLAGS; + break; + default: + snprintf(errmsg, MAXERRMSG, "unknown modifier: '%c'", *port); + goto fail; + } + port++; + break; + case P_RNGSFXOK: + switch (*port) { + case '/': + p_state = P_FLAGS; + break; + default: + snprintf(errmsg, MAXERRMSG, "unexpected character: '%c'", *port); + goto fail; + } + port++; + break; + case P_GETNUM: + num = 
strtol(port, (char **)&port, 10); + if (num < 0 || num >= NETMAP_RING_MASK) { + snprintf(errmsg, MAXERRMSG, "'%ld' out of range [0, %d)", num, NETMAP_RING_MASK); + goto fail; + } + nr_ringid = num & NETMAP_RING_MASK; + p_state = P_RNGSFXOK; + break; + case P_FLAGS: + case P_FLAGSOK: + switch (*port) { + case 'x': + nr_flags |= NR_EXCLUSIVE; + break; + case 'z': + nr_flags |= NR_ZCOPY_MON; + break; + case 't': + nr_flags |= NR_MONITOR_TX; + break; + case 'r': + nr_flags |= NR_MONITOR_RX; + break; + case 'R': + nr_flags |= NR_RX_RINGS_ONLY; + break; + case 'T': + nr_flags |= NR_TX_RINGS_ONLY; + break; + default: + snprintf(errmsg, MAXERRMSG, "unrecognized flag: '%c'", *port); + goto fail; + } + port++; + p_state = P_FLAGSOK; + break; + } + } + if (p_state != P_START && p_state != P_RNGSFXOK && p_state != P_FLAGSOK) { + snprintf(errmsg, MAXERRMSG, "unexpected end of port name"); + goto fail; + } + ND("flags: %s %s %s %s", (nr_flags & NR_EXCLUSIVE) ? "EXCLUSIVE" : "", (nr_flags & NR_ZCOPY_MON) ? "ZCOPY_MON" : "", + (nr_flags & NR_MONITOR_TX) ? "MONITOR_TX" : "", (nr_flags & NR_MONITOR_RX) ? 
"MONITOR_RX" : ""); + d = (struct nm_desc *)calloc(1, sizeof(*d)); + if (d == NULL) { + snprintf(errmsg, MAXERRMSG, "nm_desc alloc failure"); + errno = ENOMEM; + return NULL; + } + d->self = d; /* set this early so nm_close() works */ + d->fd = open(NETMAP_DEVICE_NAME, O_RDWR); + if (d->fd < 0) { + snprintf(errmsg, MAXERRMSG, "cannot open /dev/netmap: %s", strerror(errno)); + goto fail; + } + + if (req) + d->req = *req; + d->req.nr_version = NETMAP_API; + d->req.nr_ringid &= ~NETMAP_RING_MASK; + + /* these fields are overridden by ifname and flags processing */ + d->req.nr_ringid |= nr_ringid; + d->req.nr_flags |= nr_flags; + memcpy(d->req.nr_name, ifname, namelen); + d->req.nr_name[namelen] = '\0'; + /* optionally import info from parent */ + if (IS_NETMAP_DESC(parent) && new_flags) { + if (new_flags & NM_OPEN_ARG1) + D("overriding ARG1 %d", parent->req.nr_arg1); + d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ? parent->req.nr_arg1 : 4; + if (new_flags & NM_OPEN_ARG2) + D("overriding ARG2 %d", parent->req.nr_arg2); + d->req.nr_arg2 = new_flags & NM_OPEN_ARG2 ? parent->req.nr_arg2 : 0; + if (new_flags & NM_OPEN_ARG3) + D("overriding ARG3 %d", parent->req.nr_arg3); + d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ? 
parent->req.nr_arg3 : 0; + if (new_flags & NM_OPEN_RING_CFG) { + D("overriding RING_CFG"); + d->req.nr_tx_slots = parent->req.nr_tx_slots; + d->req.nr_rx_slots = parent->req.nr_rx_slots; + d->req.nr_tx_rings = parent->req.nr_tx_rings; + d->req.nr_rx_rings = parent->req.nr_rx_rings; + } + if (new_flags & NM_OPEN_IFNAME) { + D("overriding ifname %s ringid 0x%x flags 0x%x", parent->req.nr_name, parent->req.nr_ringid, + parent->req.nr_flags); + memcpy(d->req.nr_name, parent->req.nr_name, sizeof(d->req.nr_name)); + d->req.nr_ringid = parent->req.nr_ringid; + d->req.nr_flags = parent->req.nr_flags; + } + } + /* add the *XPOLL flags */ + d->req.nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL); + + if (ioctl(d->fd, NIOCREGIF, &d->req)) { + snprintf(errmsg, MAXERRMSG, "NIOCREGIF failed: %s", strerror(errno)); + goto fail; + } + + /* if parent is defined, do nm_mmap() even if NM_OPEN_NO_MMAP is set */ + if ((!(new_flags & NM_OPEN_NO_MMAP) || parent) && nm_mmap(d, parent)) { + snprintf(errmsg, MAXERRMSG, "mmap failed: %s", strerror(errno)); + goto fail; + } + + nr_reg = d->req.nr_flags & NR_REG_MASK; + + if (nr_reg == NR_REG_SW) { /* host stack */ + d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings; + d->first_rx_ring = d->last_rx_ring = d->req.nr_rx_rings; + } else if (nr_reg == NR_REG_ALL_NIC) { /* only nic */ + d->first_tx_ring = 0; + d->first_rx_ring = 0; + d->last_tx_ring = d->req.nr_tx_rings - 1; + d->last_rx_ring = d->req.nr_rx_rings - 1; + } else if (nr_reg == NR_REG_NIC_SW) { + d->first_tx_ring = 0; + d->first_rx_ring = 0; + d->last_tx_ring = d->req.nr_tx_rings; + d->last_rx_ring = d->req.nr_rx_rings; + } else if (nr_reg == NR_REG_ONE_NIC) { + /* XXX check validity */ + d->first_tx_ring = d->last_tx_ring = d->first_rx_ring = d->last_rx_ring = d->req.nr_ringid & NETMAP_RING_MASK; + } else { /* pipes */ + d->first_tx_ring = d->last_tx_ring = 0; + d->first_rx_ring = d->last_rx_ring = 0; + } + +#ifdef DEBUG_NETMAP_USER + { /* debugging code */ + int 
i; + + D("%s tx %d .. %d %d rx %d .. %d %d", ifname, d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings, d->first_rx_ring, + d->last_rx_ring, d->req.nr_rx_rings); + for (i = 0; i <= d->req.nr_tx_rings; i++) { + struct netmap_ring *r = NETMAP_TXRING(d->nifp, i); + D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + for (i = 0; i <= d->req.nr_rx_rings; i++) { + struct netmap_ring *r = NETMAP_RXRING(d->nifp, i); + D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + } +#endif /* debugging */ + + d->cur_tx_ring = d->first_tx_ring; + d->cur_rx_ring = d->first_rx_ring; + return d; + +fail: + nm_close(d); + if (errmsg[0]) + D("%s %s", errmsg, ifname); + if (errno == 0) + errno = EINVAL; + return NULL; +} + +static int nm_close(struct nm_desc *d) +{ + /* + * ugly trick to avoid unused warnings + */ + static void *__xxzt[] + __attribute__((unused)) = { (void *)nm_open, (void *)nm_inject, (void *)nm_dispatch, (void *)nm_nextpkt }; + + if (d == NULL || d->self != d) + return EINVAL; + if (d->done_mmap && d->mem) + munmap(d->mem, d->memsize); + if (d->fd != -1) { + close(d->fd); + } + + bzero(d, sizeof(*d)); + free(d); + return 0; +} + +static int nm_mmap(struct nm_desc *d, const struct nm_desc *parent) +{ + //XXX TODO: check if mmap is already done + + if (IS_NETMAP_DESC(parent) && parent->mem && parent->req.nr_arg2 == d->req.nr_arg2) { + /* do not mmap, inherit from parent */ + D("do not mmap, inherit from parent"); + d->memsize = parent->memsize; + d->mem = parent->mem; + } else { + /* XXX TODO: check if memsize is too large (or there is overflow) */ + d->memsize = d->req.nr_memsize; + d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, d->fd, 0); + if (d->mem == MAP_FAILED) { + goto fail; + } + d->done_mmap = 1; + } + { + struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); + struct netmap_ring *r = NETMAP_RXRING(nifp, ); + + *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; + *(struct netmap_ring 
**)(uintptr_t)&d->some_ring = r; + *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); + *(void **)(uintptr_t)&d->buf_end = (char *)d->mem + d->memsize; + } + + return 0; + +fail: + return EINVAL; +} + +/* + * Same prototype as pcap_inject(), only need to cast. + */ +static int nm_inject(struct nm_desc *d, const void *buf, size_t size) +{ + u_int c, n = d->last_tx_ring - d->first_tx_ring + 1; + + for (c = 0; c < n; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + uint32_t i, idx; + uint32_t ri = d->cur_tx_ring + c; + + if (ri > d->last_tx_ring) + ri = d->first_tx_ring; + ring = NETMAP_TXRING(d->nifp, ri); + if (nm_ring_empty(ring)) { + continue; + } + i = ring->cur; + idx = ring->slot[i].buf_idx; + ring->slot[i].len = size; + nm_pkt_copy(buf, NETMAP_BUF(ring, idx), size); + d->cur_tx_ring = ri; + ring->head = ring->cur = nm_ring_next(ring, i); + return size; + } + return 0; /* fail */ +} + +/* + * Same prototype as pcap_dispatch(), only need to cast. + */ +static int nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) +{ + int n = d->last_rx_ring - d->first_rx_ring + 1; + int c, got = 0, ri = d->cur_rx_ring; + + if (cnt == 0) + cnt = -1; + /* cnt == -1 means infinite, but rings have a finite amount + * of buffers and the int is large enough that we never wrap, + * so we can omit checking for -1 + */ + for (c = 0; c < n && cnt != got; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + + ri = d->cur_rx_ring + c; + if (ri > d->last_rx_ring) + ri = d->first_rx_ring; + ring = NETMAP_RXRING(d->nifp, ri); + for (; !nm_ring_empty(ring) && cnt != got; got++) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + + // __builtin_prefetch(buf); + d->hdr.len = d->hdr.caplen = ring->slot[i].len; + d->hdr.ts = ring->ts; + cb(arg, &d->hdr, buf); + ring->head = ring->cur = nm_ring_next(ring, i); + } + } + d->cur_rx_ring = ri; + return got; +} + +static u_char 
*nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr) +{ + int ri = d->cur_rx_ring; + + do { + /* compute current ring to use */ + struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri); + if (!nm_ring_empty(ring)) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + + // __builtin_prefetch(buf); + hdr->ts = ring->ts; + hdr->len = hdr->caplen = ring->slot[i].len; + ring->cur = nm_ring_next(ring, i); + /* we could postpone advancing head if we want + * to hold the buffer. This can be supported in + * the future. + */ + ring->head = ring->cur; + d->cur_rx_ring = ri; + return buf; + } + ri++; + if (ri > d->last_rx_ring) + ri = d->first_rx_ring; + } while (ri != d->cur_rx_ring); + return NULL; /* nothing found */ +} + +#endif /* !HAVE_NETMAP_WITH_LIBS */ + +#endif /* NETMAP_WITH_LIBS */ + +#endif /* _NET_NETMAP_USER_H_ */ diff --git a/lib/flash/mtcp/include/pacing.h b/lib/flash/mtcp/include/pacing.h new file mode 100644 index 0000000..f7cf1da --- /dev/null +++ b/lib/flash/mtcp/include/pacing.h @@ -0,0 +1,63 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __PACING_H_ +#define __PACING_H_ + +#include "tcp_stream.h" +#include "clock.h" + +#if RATE_LIMIT_ENABLED +typedef struct token_bucket { + double tokens; + uint32_t rate; + uint32_t burst; + uint32_t last_fill_t; +} token_bucket; + +token_bucket *NewTokenBucket(); +int SufficientTokens(token_bucket *bucket, uint64_t new_bits); +void PrintBucket(token_bucket *bucket); +#endif + +#if PACING_ENABLED +typedef struct packet_pacer { + uint32_t rate_bps; + uint32_t extra_packets; + uint32_t next_send_time; +} packet_pacer; + +packet_pacer *NewPacketPacer(); +int CanSendNow(packet_pacer *pacer); +void PrintPacer(packet_pacer *pacer); +#endif + +#endif diff --git a/lib/flash/mtcp/include/pipe.h b/lib/flash/mtcp/include/pipe.h new file mode 100644 index 0000000..a12aaea --- /dev/null +++ b/lib/flash/mtcp/include/pipe.h @@ -0,0 +1,45 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MTCP_PIPE_H +#define MTCP_PIPE_H + +#include + +int PipeRead(mctx_t mctx, int pipeid, char *buf, int len); + +int PipeWrite(mctx_t mctx, int pipeid, const char *buf, int len); + +int RaisePendingPipeEvents(mctx_t mctx, int epid, int pipeid); + +int PipeClose(mctx_t mctx, int pipeid); + +#endif /* MTCP_PIPE_H */ diff --git a/lib/flash/mtcp/include/ps.h b/lib/flash/mtcp/include/ps.h new file mode 100644 index 0000000..3f5ce31 --- /dev/null +++ b/lib/flash/mtcp/include/ps.h @@ -0,0 +1,346 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _PS_H_ +#define _PS_H_ + +#define MAX_DEVICES 16 +#define MAX_RINGS 64 + +/* IN: option for ps_wait(); */ +#define PS_CTL_IN 0x1 /* The associated queue is available to read */ +#define PS_CTL_OUT 0x2 /* The associated queue is available to write */ +/* The associated queue is available to write or read */ +#define PS_CTL_INOUT (PS_CTL_IN | PS_CTL_OUT) + +/* OUT: return values for ps_wait() */ +#define PS_SEND_AVAILABLE 0x1 /* The associated queue is available to read */ +#define PS_RECEIVE_AVAILABLE 0x2 /* The associated queue is available to write */ +/* The associated queue is available to read and write */ +#define PS_ALL_AVAILABLE (PS_SEND_AVAILABLE | PS_RECEIVE_AVAILABLE) + +#define PS_SEND_MIN 256 + +#ifdef __KERNEL__ + +#define PS_MAJOR 1010 +#define PS_NAME "packet_shader" + +#define MAX_BUFS (12 * 4) + +struct ____cacheline_aligned ps_context { + struct semaphore sem; + + wait_queue_head_t wq; + + int num_attached; + struct ixgbe_ring *rx_rings[MAX_RINGS]; + int next_ring; + + struct ps_pkt_info *info; + /* char *buf; */ + + int num_bufs; + int buf_refcnt[MAX_BUFS]; + char *kbufs[MAX_BUFS]; + char __user *ubufs[MAX_BUFS]; +}; + +#else /* __KERNEL__ */ + +#include +#include +#include +#include + +#define __user + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif + +#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a) - 1) +#define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) + +#if defined(__i386__) || 
defined(__x86_64__) +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + unsigned int sum; + + asm(" movl (%1), %0\n" + " subl $4, %2\n" + " jbe 2f\n" + " addl 4(%1), %0\n" + " adcl 8(%1), %0\n" + " adcl 12(%1), %0\n" + "1: adcl 16(%1), %0\n" + " lea 4(%1), %1\n" + " decl %2\n" + " jne 1b\n" + " adcl $0, %0\n" + " movl %0, %2\n" + " shrl $16, %0\n" + " addw %w2, %w0\n" + " adcl $0, %0\n" + " notl %0\n" + "2:" + /* Since the input registers which are loaded with iph and ih + are modified, we must also specify them as outputs, or gcc + will assume they contain their original values. */ + : "=r"(sum), "=r"(iph), "=r"(ihl) + : "1"(iph), "2"(ihl) + : "memory"); + return (__sum16)sum; +} +#else +#define __force +typedef unsigned int u32; + +static inline __sum16 csum_fold(__wsum csum) +{ + u32 sum = (__force u32)csum; + ; + + sum += (sum << 16); + csum = (sum < csum); + sum >>= 16; + sum += csum; + + return (__force __sum16)~sum; +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * + * By Jorge Cwik , adapted for linux by + * Arnt Gulbrandsen. + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + const unsigned int *word = iph; + const unsigned int *stop = word + ihl; + unsigned int csum; + int carry; + + csum = word[0]; + csum += word[1]; + carry = (csum < word[1]); + csum += carry; + + csum += word[2]; + carry = (csum < word[2]); + csum += carry; + + csum += word[3]; + carry = (csum < word[3]); + csum += carry; + + word += 4; + do { + csum += *word; + carry = (csum < *word); + csum += carry; + word++; + } while (word != stop); + + return csum_fold(csum); +} +#endif + +#endif /* __KERNEL__ */ + +struct ps_device { + char name[IFNAMSIZ]; + char dev_addr[ETH_ALEN]; + uint32_t ip_addr; /* network order */ + + /* NOTE: this is different from kernel's internal index */ + int ifindex; + + /* This is kernel's ifindex. 
*/ + int kifindex; + + int num_rx_queues; + int num_tx_queues; +}; + +struct ps_queue { + int ifindex; + int qidx; +}; + +#define MAX_PACKET_SIZE 2048 +#define MAX_CHUNK_SIZE 4096 +#define ENTRY_CNT 4096 + +#define PS_CHECKSUM_RX_UNKNOWN 0 +#define PS_CHECKSUM_RX_GOOD 1 +#define PS_CHECKSUM_RX_BAD 2 + +struct ps_pkt_info { + uint32_t offset; + uint16_t len; + uint8_t checksum_rx; +}; + +struct ps_chunk { + /* number of packets to send/recv */ + int cnt; + int recv_blocking; + + /* + for RX: output (where did these packets come from?) + for TX: input (which interface do you want to xmit?) + */ + struct ps_queue queue; + + struct ps_pkt_info __user *info; + char __user *buf; +}; + +struct ps_chunk_buf { + uint16_t cnt; + uint16_t next_to_use; + uint16_t next_to_send; + uint32_t next_offset; + + struct ps_queue queue; + void __user *lock; + struct ps_pkt_info __user *info; + char __user *buf; +}; + +struct ps_packet { + int ifindex; + int len; + char __user *buf; +}; + +#define NID_ZERO(isp) (isp = 0) +#define NID_SET(id, isp) (isp |= 1 << id) +#define NID_CLR(id, isp) (isp &= ~(1 << id)) +#define NID_ISSET(id, isp) (isp & (1 << id)) + +// maximum number of interface descriptor is 16 +typedef uint16_t nids_set; +struct ps_event { + long timeout; + int qidx; + + nids_set rx_nids; + nids_set tx_nids; +}; + +static inline void prefetcht0(void *p) +{ + asm volatile("prefetcht0 (%0)\n\t" : : "r"(p)); +} + +static inline void prefetchnta(void *p) +{ + asm volatile("prefetchnta (%0)\n\t" : : "r"(p)); +} + +static inline void memcpy_aligned(void *to, const void *from, size_t len) +{ + if (len <= 64) { + memcpy(to, from, 64); + } else if (len <= 128) { + memcpy(to, from, 64); + memcpy((uint8_t *)to + 64, (const uint8_t *)from + 64, 64); + } else { + size_t offset; + + for (offset = 0; offset < len; offset += 64) + memcpy((uint8_t *)to + offset, (const uint8_t *)from + offset, 64); + } +} + +#define PS_IOC_LIST_DEVICES 0 +#define PS_IOC_ATTACH_RX_DEVICE 1 +#define 
PS_IOC_DETACH_RX_DEVICE 2 +#define PS_IOC_RECV_CHUNK 3 +#define PS_IOC_SEND_CHUNK 4 +#define PS_IOC_SLOWPATH_PACKET 5 +#define PS_IOC_RECV_CHUNK_IFIDX 6 +#define PS_IOC_SEND_CHUNK_BUF 7 +#define PS_IOC_GET_TXENTRY 8 +#define PS_IOC_SELECT 9 + +#ifndef __KERNEL__ + +struct ps_handle { + int fd; + + uint64_t rx_chunks[MAX_DEVICES]; + uint64_t rx_packets[MAX_DEVICES]; + uint64_t rx_bytes[MAX_DEVICES]; + + uint64_t tx_chunks[MAX_DEVICES]; + uint64_t tx_packets[MAX_DEVICES]; + uint64_t tx_bytes[MAX_DEVICES]; + + void *priv; +}; + +int ps_list_devices(struct ps_device *devices); +int ps_init_handle(struct ps_handle *handle); +void ps_close_handle(struct ps_handle *handle); +int ps_attach_rx_device(struct ps_handle *handle, struct ps_queue *queue); +int ps_detach_rx_device(struct ps_handle *handle, struct ps_queue *queue); +int ps_alloc_chunk(struct ps_handle *handle, struct ps_chunk *chunk); +void ps_free_chunk(struct ps_chunk *chunk); +int ps_alloc_chunk_buf(struct ps_handle *handle, int ifidx, int qidx, struct ps_chunk_buf *c_buf); +void ps_free_chunk_buf(struct ps_chunk_buf *c_buf); +char *ps_assign_chunk_buf(struct ps_chunk_buf *c_buf, int len); +int ps_recv_chunk(struct ps_handle *handle, struct ps_chunk *chunk); +int ps_recv_chunk_ifidx(struct ps_handle *handle, struct ps_chunk *chunk, int ifidx); +int ps_send_chunk(struct ps_handle *handle, struct ps_chunk *chunk); +int ps_send_chunk_buf(struct ps_handle *handle, struct ps_chunk_buf *chunk); +int ps_select(struct ps_handle *handle, struct ps_event *event); +int ps_get_txentry(struct ps_handle *handle, struct ps_queue *queue); +int ps_slowpath_packet(struct ps_handle *handle, struct ps_packet *packet); + +void dump_packet(char *buf, int len); +void dump_chunk(struct ps_chunk *chunk); + +int get_num_cpus(void); +int bind_cpu(int cpu); +uint64_t rdtsc(void); + +#endif + +#endif /* _PS_H_ */ diff --git a/lib/flash/mtcp/include/rss.h b/lib/flash/mtcp/include/rss.h new file mode 100644 index 0000000..8c667b1 --- 
/dev/null +++ b/lib/flash/mtcp/include/rss.h @@ -0,0 +1,40 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef RSS_H +#define RSS_H + +#include + +/* sip, dip, sp, dp: in network byte order */ +int GetRSSCPUCore(in_addr_t sip, in_addr_t dip, in_port_t sp, in_port_t dp, int num_queues, uint8_t endian_check); + +#endif /* RSS_H */ diff --git a/lib/flash/mtcp/include/socket.h b/lib/flash/mtcp/include/socket.h new file mode 100644 index 0000000..0e09c2c --- /dev/null +++ b/lib/flash/mtcp/include/socket.h @@ -0,0 +1,87 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SOCKET_H +#define SOCKET_H + +#include "mtcp_api.h" +#include "mtcp_epoll.h" + +/*----------------------------------------------------------------------------*/ +enum socket_opts { + MTCP_NONBLOCK = 0x01, + MTCP_ADDR_BIND = 0x02, +}; +/*----------------------------------------------------------------------------*/ +struct socket_map { + int id; + int socktype; + uint32_t opts; + + struct sockaddr_in saddr; + + union { + struct tcp_stream *stream; + struct tcp_listener *listener; + struct mtcp_epoll *ep; + struct pipe *pp; + }; + + uint32_t epoll; /* registered events */ + uint32_t events; /* available events */ + mtcp_epoll_data_t ep_data; + + TAILQ_ENTRY(socket_map) free_smap_link; +}; +/*----------------------------------------------------------------------------*/ +typedef struct socket_map *socket_map_t; +/*----------------------------------------------------------------------------*/ +socket_map_t AllocateSocket(mctx_t mctx, int socktype, int need_lock); +/*----------------------------------------------------------------------------*/ +void FreeSocket(mctx_t mctx, int sockid, int need_lock); +/*----------------------------------------------------------------------------*/ +socket_map_t GetSocket(mctx_t mctx, int sockid); +/*----------------------------------------------------------------------------*/ +struct tcp_listener { + int sockid; + socket_map_t socket; + + int backlog; + stream_queue_t acceptq; + + pthread_mutex_t accept_lock; + pthread_cond_t 
accept_cond; + + TAILQ_ENTRY(tcp_listener) he_link; /* hash table entry link */ +}; +/*----------------------------------------------------------------------------*/ + +#endif /* SOCKET_H */ diff --git a/lib/flash/mtcp/include/stat.h b/lib/flash/mtcp/include/stat.h new file mode 100644 index 0000000..f7dd1b3 --- /dev/null +++ b/lib/flash/mtcp/include/stat.h @@ -0,0 +1,110 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef STAT_H +#define STAT_H + +struct run_stat { + uint64_t rounds; + uint64_t rounds_rx; + uint64_t rounds_rx_try; + uint64_t rounds_tx; + uint64_t rounds_tx_try; + uint64_t rounds_select; + uint64_t rounds_select_rx; + uint64_t rounds_select_tx; + uint64_t rounds_select_intr; + + uint64_t rounds_accept; + uint64_t rounds_read; + uint64_t rounds_write; + uint64_t rounds_epoll; + uint64_t rounds_wndadv; + + uint64_t rounds_rtocheck; + uint64_t rounds_twcheck; + uint64_t rounds_tocheck; +}; + +struct stat_counter { + uint64_t cnt; + uint64_t sum; + uint64_t max; + uint64_t min; +}; + +struct time_stat { + struct stat_counter round; + struct stat_counter processing; + struct stat_counter tcheck; + struct stat_counter epoll; + struct stat_counter handle; + struct stat_counter xmit; + struct stat_counter select; +}; + +struct net_stat { + uint64_t tx_packets[MAX_DEVICES]; + uint64_t tx_bytes[MAX_DEVICES]; + uint64_t tx_drops[MAX_DEVICES]; + uint64_t rx_packets[MAX_DEVICES]; + uint64_t rx_bytes[MAX_DEVICES]; + uint64_t rx_errors[MAX_DEVICES]; +#ifdef ENABLELRO + uint64_t tx_gdptbytes; + uint64_t rx_gdptbytes; +#endif +}; + +struct bcast_stat { + uint64_t cycles; + uint64_t write; + uint64_t read; + uint64_t epoll; + uint64_t wnd_adv; + uint64_t ack; +}; + +struct timeout_stat { + uint64_t cycles; + uint64_t rto_try; + uint64_t rto; + uint64_t timewait_try; + uint64_t timewait; +}; + +#ifdef NETSTAT +#define STAT_COUNT(stat) stat++ +#else +#define STAT_COUNT(stat) 
+#endif + +#endif /* STAT_H */ diff --git a/lib/flash/mtcp/include/tcp_in.h b/lib/flash/mtcp/include/tcp_in.h new file mode 100644 index 0000000..8479409 --- /dev/null +++ b/lib/flash/mtcp/include/tcp_in.h @@ -0,0 +1,149 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TCP_IN_H +#define TCP_IN_H + +#include +#include +#include +#include + +#include "mtcp.h" +#include "fhash.h" + +#ifndef TCP_FLAGS +#define TCP_FLAGS +#define TCP_FLAG_FIN 0x01 // 0000 0001 +#define TCP_FLAG_SYN 0x02 // 0000 0010 +#define TCP_FLAG_RST 0x04 // 0000 0100 +#define TCP_FLAG_PSH 0x08 // 0000 1000 +#define TCP_FLAG_ACK 0x10 // 0001 0000 +#define TCP_FLAG_URG 0x20 // 0010 0000 +#endif +#define TCP_FLAG_SACK 0x40 // 0100 0000 +#define TCP_FLAG_WACK 0x80 // 1000 0000 + +#define TCP_OPT_FLAG_MSS 0x02 // 0000 0010 +#define TCP_OPT_FLAG_WSCALE 0x04 // 0000 0100 +#define TCP_OPT_FLAG_SACK_PERMIT 0x08 // 0000 1000 +#define TCP_OPT_FLAG_SACK 0x10 // 0001 0000 +#define TCP_OPT_FLAG_TIMESTAMP 0x20 // 0010 0000 + +#define TCP_OPT_MSS_LEN 4 +#define TCP_OPT_WSCALE_LEN 3 +#define TCP_OPT_SACK_PERMIT_LEN 2 +#define TCP_OPT_SACK_LEN 10 +#define TCP_OPT_TIMESTAMP_LEN 10 + +#define TCP_DEFAULT_MSS 1460 +#define TCP_DEFAULT_WSCALE 7 +#define TCP_INITIAL_WINDOW 14600 // initial window size + +#define TCP_SEQ_LT(a, b) ((int32_t)((a) - (b)) < 0) +#define TCP_SEQ_LEQ(a, b) ((int32_t)((a) - (b)) <= 0) +#define TCP_SEQ_GT(a, b) ((int32_t)((a) - (b)) > 0) +#define TCP_SEQ_GEQ(a, b) ((int32_t)((a) - (b)) >= 0) +#define TCP_SEQ_BETWEEN(a, b, c) (TCP_SEQ_GEQ(a, b) && TCP_SEQ_LEQ(a, c)) + +/* convert timeval to timestamp (precision: 1 ms) */ +#define HZ 1000 +#define TIME_TICK (1000000 / HZ) // in us +#define TIMEVAL_TO_TS(t) (uint32_t)((t)->tv_sec * HZ + ((t)->tv_usec / TIME_TICK)) + +#define TS_TO_USEC(t) ((t) * TIME_TICK) +#define TS_TO_MSEC(t) (TS_TO_USEC(t) / 1000) + +#define USEC_TO_TS(t) ((t) / TIME_TICK) +#define MSEC_TO_TS(t) (USEC_TO_TS((t) * 1000)) +#define SEC_TO_TS(t) (t * HZ) + +#define SEC_TO_USEC(t) ((t) * 1000000) +#define SEC_TO_MSEC(t) ((t) * 1000) +#define MSEC_TO_USEC(t) ((t) * 1000) +#define USEC_TO_SEC(t) ((t) / 1000000) +//#define TCP_TIMEWAIT (MSEC_TO_USEC(5000) / TIME_TICK) // 5s +#define TCP_TIMEWAIT 0 +#define TCP_INITIAL_RTO 
(MSEC_TO_USEC(500) / TIME_TICK) // 500ms +#define TCP_FIN_RTO (MSEC_TO_USEC(500) / TIME_TICK) // 500ms +#define TCP_TIMEOUT (MSEC_TO_USEC(30000) / TIME_TICK) // 30s + +#define TCP_MAX_RTX 16 +#define TCP_MAX_SYN_RETRY 7 +#define TCP_MAX_BACKOFF 7 + +#define TCP_INIT_CWND 2 + +enum tcp_state { + TCP_ST_CLOSED = 0, + TCP_ST_LISTEN = 1, + TCP_ST_SYN_SENT = 2, + TCP_ST_SYN_RCVD = 3, + TCP_ST_ESTABLISHED = 4, + TCP_ST_FIN_WAIT_1 = 5, + TCP_ST_FIN_WAIT_2 = 6, + TCP_ST_CLOSE_WAIT = 7, + TCP_ST_CLOSING = 8, + TCP_ST_LAST_ACK = 9, + TCP_ST_TIME_WAIT = 10 +}; + +enum tcp_option { + TCP_OPT_END = 0, + TCP_OPT_NOP = 1, + TCP_OPT_MSS = 2, + TCP_OPT_WSCALE = 3, + TCP_OPT_SACK_PERMIT = 4, + TCP_OPT_SACK = 5, + TCP_OPT_TIMESTAMP = 8 +}; + +enum tcp_close_reason { + TCP_NOT_CLOSED = 0, + TCP_ACTIVE_CLOSE = 1, + TCP_PASSIVE_CLOSE = 2, + TCP_CONN_FAIL = 3, + TCP_CONN_LOST = 4, + TCP_RESET = 5, + TCP_NO_MEM = 6, + TCP_NOT_ACCEPTED = 7, + TCP_TIMEDOUT = 8 +}; + +void ParseTCPOptions(tcp_stream *cur_stream, uint32_t cur_ts, const uint8_t *tcpopt, int len); + +extern inline int ProcessTCPUplink(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, const struct tcphdr *tcph, + uint32_t seq, uint32_t ack_seq, uint8_t *payload, int payloadlen, uint32_t window); + +int ProcessTCPPacket(struct mtcp_manager *mtcp, uint32_t cur_ts, const int ifidx, const struct iphdr *iph, int ip_len); +uint16_t TCPCalcChecksum(uint16_t *buf, uint16_t len, uint32_t saddr, uint32_t daddr); + +#endif /* TCP_IN_H */ diff --git a/lib/flash/mtcp/include/tcp_out.h b/lib/flash/mtcp/include/tcp_out.h new file mode 100644 index 0000000..973b49b --- /dev/null +++ b/lib/flash/mtcp/include/tcp_out.h @@ -0,0 +1,67 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TCP_OUT_H +#define TCP_OUT_H + +#include "mtcp.h" +#include "tcp_stream.h" + +enum ack_opt { ACK_OPT_NOW, ACK_OPT_AGGREGATE, ACK_OPT_WACK }; + +int SendTCPPacketStandalone(struct mtcp_manager *mtcp, uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport, uint32_t seq, + uint32_t ack_seq, uint16_t window, uint8_t flags, uint8_t *payload, uint16_t payloadlen, uint32_t cur_ts, + uint32_t echo_ts); + +int SendTCPPacket(struct mtcp_manager *mtcp, tcp_stream *cur_stream, uint32_t cur_ts, uint8_t flags, uint8_t *payload, + uint16_t payloadlen); + +extern inline int WriteTCPControlList(mtcp_manager_t mtcp, struct mtcp_sender *sender, uint32_t cur_ts, int thresh); + +extern inline int WriteTCPDataList(mtcp_manager_t mtcp, struct mtcp_sender *sender, uint32_t cur_ts, int thresh); + +extern inline int WriteTCPACKList(mtcp_manager_t mtcp, struct mtcp_sender *sender, uint32_t cur_ts, int thresh); + +extern inline void AddtoControlList(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts); + +extern inline void AddtoSendList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void RemoveFromControlList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void RemoveFromSendList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void RemoveFromACKList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void EnqueueACK(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts, uint8_t opt); + +extern inline void DumpControlList(mtcp_manager_t mtcp, struct mtcp_sender *sender); + +#endif /* TCP_OUT_H */ diff --git a/lib/flash/mtcp/include/tcp_rb_frag_queue.h b/lib/flash/mtcp/include/tcp_rb_frag_queue.h new file mode 100644 index 0000000..1aa85f7 --- /dev/null +++ b/lib/flash/mtcp/include/tcp_rb_frag_queue.h @@ -0,0 +1,49 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TCP_RB_FRAG_QUEUE +#define TCP_RB_FRAG_QUEUE + +#include "tcp_ring_buffer.h" + +/*---------------------------------------------------------------------------*/ +typedef struct rb_frag_queue *rb_frag_queue_t; +/*---------------------------------------------------------------------------*/ +rb_frag_queue_t CreateRBFragQueue(int capacity); +/*---------------------------------------------------------------------------*/ +void DestroyRBFragQueue(rb_frag_queue_t rb_fragq); +/*---------------------------------------------------------------------------*/ +int RBFragEnqueue(rb_frag_queue_t rb_fragq, struct fragment_ctx *frag); +/*---------------------------------------------------------------------------*/ +struct fragment_ctx *RBFragDequeue(rb_frag_queue_t rb_fragq); +/*---------------------------------------------------------------------------*/ + +#endif /* TCP_RB_FRAG_QUEUE */ diff --git a/lib/flash/mtcp/include/tcp_ring_buffer.h b/lib/flash/mtcp/include/tcp_ring_buffer.h new file mode 100644 index 0000000..9f7d2ac --- /dev/null +++ b/lib/flash/mtcp/include/tcp_ring_buffer.h @@ -0,0 +1,100 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * 2010.12.10 Shinae Woo + * Ring buffer structure for managing dynamically allocating ring buffer + * + * put data to the tail + * get/pop/remove data from the head + * + * always garantee physically continuous ready in-memory data from data_offset to the data_offset+len + * automatically increase total buffer size when buffer is full + * for efficiently managing packet payload and chunking + * + */ + +#ifndef NRE_RING_BUFFER +#define NRE_RING_BUFFER + +#include +#include + +/*----------------------------------------------------------------------------*/ +enum rb_caller { AT_APP, AT_MTCP }; +/*----------------------------------------------------------------------------*/ +typedef struct mtcp_manager *mtcp_manager_t; +typedef struct rb_manager *rb_manager_t; +/*----------------------------------------------------------------------------*/ +struct fragment_ctx { + uint32_t seq; + uint32_t len : 31; + uint32_t is_calloc : 1; + struct fragment_ctx *next; +}; 
+/*----------------------------------------------------------------------------*/ +struct tcp_ring_buffer { + u_char *data; /* buffered data */ + u_char *head; /* pointer to the head */ + + uint32_t head_offset; /* offset for the head (head - data) */ + uint32_t tail_offset; /* offset fot the last byte (null byte) */ + + int merged_len; /* contiguously merged length */ + uint64_t cum_len; /* cummulatively merged length */ + int last_len; /* currently saved data length */ + int size; /* total ring buffer size */ + + /* TCP payload features */ + uint32_t head_seq; + uint32_t init_seq; + + struct fragment_ctx *fctx; +}; +/*----------------------------------------------------------------------------*/ +uint32_t RBGetCurnum(rb_manager_t rbm); +void RBPrintInfo(struct tcp_ring_buffer *buff); +void RBPrintStr(struct tcp_ring_buffer *buff); +void RBPrintHex(struct tcp_ring_buffer *buff); +/*----------------------------------------------------------------------------*/ +rb_manager_t RBManagerCreate(mtcp_manager_t mtcp, size_t chunk_size, uint32_t cnum); +/*----------------------------------------------------------------------------*/ +struct tcp_ring_buffer *RBInit(rb_manager_t rbm, uint32_t init_seq); +void RBFree(rb_manager_t rbm, struct tcp_ring_buffer *buff); +uint32_t RBIsDanger(rb_manager_t rbm); +/*----------------------------------------------------------------------------*/ +/* data manupulation functions */ +int RBPut(rb_manager_t rbm, struct tcp_ring_buffer *buff, void *data, uint32_t len, uint32_t seq); +size_t RBGet(rb_manager_t rbm, struct tcp_ring_buffer *buff, size_t len); +size_t RBRemove(rb_manager_t rbm, struct tcp_ring_buffer *buff, size_t len, int option); +/*----------------------------------------------------------------------------*/ + +#endif diff --git a/lib/flash/mtcp/include/tcp_sb_queue.h b/lib/flash/mtcp/include/tcp_sb_queue.h new file mode 100644 index 0000000..300e0f0 --- /dev/null +++ b/lib/flash/mtcp/include/tcp_sb_queue.h @@ -0,0 +1,49 @@ 
+/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TCP_SB_QUEUE +#define TCP_SB_QUEUE + +#include "tcp_send_buffer.h" + +/*---------------------------------------------------------------------------*/ +typedef struct sb_queue *sb_queue_t; +/*---------------------------------------------------------------------------*/ +sb_queue_t CreateSBQueue(int capacity); +/*---------------------------------------------------------------------------*/ +void DestroySBQueue(sb_queue_t sq); +/*---------------------------------------------------------------------------*/ +int SBEnqueue(sb_queue_t sq, struct tcp_send_buffer *buf); +/*---------------------------------------------------------------------------*/ +struct tcp_send_buffer *SBDequeue(sb_queue_t sq); +/*---------------------------------------------------------------------------*/ + +#endif /* TCP_SB_QUEUE */ diff --git a/lib/flash/mtcp/include/tcp_send_buffer.h b/lib/flash/mtcp/include/tcp_send_buffer.h new file mode 100644 index 0000000..58dff83 --- /dev/null +++ b/lib/flash/mtcp/include/tcp_send_buffer.h @@ -0,0 +1,69 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TCP_SEND_BUFFER_H +#define TCP_SEND_BUFFER_H + +#include +#include + +/*----------------------------------------------------------------------------*/ +typedef struct sb_manager *sb_manager_t; +typedef struct mtcp_manager *mtcp_manager_t; +/*----------------------------------------------------------------------------*/ +struct tcp_send_buffer { + unsigned char *data; + unsigned char *head; + + uint32_t head_off; + uint32_t tail_off; + uint32_t len; + uint64_t cum_len; + uint32_t size; + + uint32_t head_seq; + uint32_t init_seq; +}; +/*----------------------------------------------------------------------------*/ +uint32_t SBGetCurnum(sb_manager_t sbm); +/*----------------------------------------------------------------------------*/ +sb_manager_t SBManagerCreate(mtcp_manager_t mtcp, size_t chunk_size, uint32_t cnum); +/*----------------------------------------------------------------------------*/ +struct tcp_send_buffer *SBInit(sb_manager_t sbm, uint32_t init_seq); +/*----------------------------------------------------------------------------*/ +void SBFree(sb_manager_t sbm, struct tcp_send_buffer *buf); +/*----------------------------------------------------------------------------*/ +size_t 
SBPut(sb_manager_t sbm, struct tcp_send_buffer *buf, const void *data, size_t len); +/*----------------------------------------------------------------------------*/ +size_t SBRemove(sb_manager_t sbm, struct tcp_send_buffer *buf, size_t len); +/*----------------------------------------------------------------------------*/ + +#endif /* TCP_SEND_BUFFER_H */ diff --git a/lib/flash/mtcp/include/tcp_stream.h b/lib/flash/mtcp/include/tcp_stream.h new file mode 100644 index 0000000..27ebf57 --- /dev/null +++ b/lib/flash/mtcp/include/tcp_stream.h @@ -0,0 +1,276 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TCP_STREAM_H +#define TCP_STREAM_H + +#include +#include +#include + +#include "mtcp.h" + +#ifndef RATE_LIMIT_ENABLED +#define RATE_LIMIT_ENABLED FALSE +#endif + +#ifndef PACING_ENABLED +#define PACING_ENABLED FALSE +#endif + +#if RATE_LIMIT_ENABLED || PACING_ENABLED +#include "pacing.h" +#endif + +struct rtm_stat { + uint32_t tdp_ack_cnt; + uint32_t tdp_ack_bytes; + uint32_t ack_upd_cnt; + uint32_t ack_upd_bytes; +#if TCP_OPT_SACK_ENABLED + uint32_t sack_cnt; + uint32_t sack_bytes; + uint32_t tdp_sack_cnt; + uint32_t tdp_sack_bytes; +#endif /* TCP_OPT_SACK_ENABLED */ + uint32_t rto_cnt; + uint32_t rto_bytes; +}; + +#if TCP_OPT_SACK_ENABLED +struct sack_entry { + uint32_t left_edge; + uint32_t right_edge; + uint32_t expire; +}; +#endif /* TCP_OPT_SACK_ENABLED */ + +struct tcp_recv_vars { + /* receiver variables */ + uint32_t rcv_wnd; /* receive window (unscaled) */ + //uint32_t rcv_up; /* receive urgent pointer */ + uint32_t irs; /* initial receiving sequence */ + uint32_t snd_wl1; /* segment seq number for last window update */ + uint32_t snd_wl2; /* segment ack number for last window update */ + + /* variables for fast retransmission */ + uint8_t dup_acks; /* number of duplicated acks */ + uint32_t last_ack_seq; /* highest ackd seq */ + + /* timestamps */ + uint32_t ts_recent; /* recent peer timestamp */ + uint32_t ts_lastack_rcvd; /* last ack rcvd time */ + uint32_t ts_last_ts_upd; /* last peer ts update time */ + uint32_t ts_tw_expire; // timestamp for 
timewait expire + + /* RTT estimation variables */ + uint32_t srtt; /* smoothed round trip time << 3 (scaled) */ + uint32_t mdev; /* medium deviation */ + uint32_t mdev_max; /* maximal mdev ffor the last rtt period */ + uint32_t rttvar; /* smoothed mdev_max */ + uint32_t rtt_seq; /* sequence number to update rttvar */ + +#if TCP_OPT_SACK_ENABLED /* currently not used */ +#define MAX_SACK_ENTRY 8 + uint32_t sacked_pkts; + struct sack_entry sack_table[MAX_SACK_ENTRY]; + uint8_t sacks : 3; +#endif /* TCP_OPT_SACK_ENABLED */ + + struct tcp_ring_buffer *rcvbuf; +#if USE_SPIN_LOCK + pthread_spinlock_t read_lock; +#else + pthread_mutex_t read_lock; +#endif + + TAILQ_ENTRY(tcp_stream) he_link; /* hash table entry link */ + +#if BLOCKING_SUPPORT + TAILQ_ENTRY(tcp_stream) rcv_br_link; + pthread_cond_t read_cond; +#endif +}; + +struct tcp_send_vars { + /* IP-level information */ + uint16_t ip_id; + + uint16_t mss; /* maximum segment size */ + uint16_t eff_mss; /* effective segment size (excluding tcp option) */ + + uint8_t wscale_mine; /* my window scale (adertising window) */ + uint8_t wscale_peer; /* peer's window scale (advertised window) */ + int8_t nif_out; /* cached output network interface */ + unsigned char *d_haddr; /* cached destination MAC address */ + + /* send sequence variables */ + uint32_t snd_una; /* send unacknoledged */ + uint32_t snd_wnd; /* send window (unscaled) */ + uint32_t peer_wnd; /* client window size */ + //uint32_t snd_up; /* send urgent pointer (not used) */ + uint32_t iss; /* initial sending sequence */ + uint32_t fss; /* final sending sequence */ + + /* retransmission timeout variables */ + uint8_t nrtx; /* number of retransmission */ + uint8_t max_nrtx; /* max number of retransmission */ + uint32_t rto; /* retransmission timeout */ + uint32_t ts_rto; /* timestamp for retransmission timeout */ + + /* congestion control variables */ + uint32_t cwnd; /* congestion window */ + uint32_t ssthresh; /* slow start threshold */ +#if USE_CCP + uint32_t 
missing_seq; +#endif + + /* timestamp */ + uint32_t ts_lastack_sent; /* last ack sent time */ + + uint8_t is_wack : 1, /* is ack for window adertisement? */ + ack_cnt : 6; /* number of acks to send. max 64 */ + + uint8_t on_control_list; + uint8_t on_send_list; + uint8_t on_ack_list; + uint8_t on_sendq; + uint8_t on_ackq; + uint8_t on_closeq; + uint8_t on_resetq; + + uint8_t on_closeq_int : 1, on_resetq_int : 1, is_fin_sent : 1, is_fin_ackd : 1; + + TAILQ_ENTRY(tcp_stream) control_link; + TAILQ_ENTRY(tcp_stream) send_link; + TAILQ_ENTRY(tcp_stream) ack_link; + + TAILQ_ENTRY(tcp_stream) timer_link; /* timer link (rto list, tw list) */ + TAILQ_ENTRY(tcp_stream) timeout_link; /* connection timeout link */ + + struct tcp_send_buffer *sndbuf; +#if USE_SPIN_LOCK + pthread_spinlock_t write_lock; +#else + pthread_mutex_t write_lock; +#endif + +#if RTM_STAT + struct rtm_stat rstat; /* retransmission statistics */ +#endif + +#if BLOCKING_SUPPORT + TAILQ_ENTRY(tcp_stream) snd_br_link; + pthread_cond_t write_cond; +#endif +}; + +typedef struct tcp_stream { + socket_map_t socket; + + uint32_t id : 24, stream_type : 8; + + uint32_t saddr; /* in network order */ + uint32_t daddr; /* in network order */ + uint16_t sport; /* in network order */ + uint16_t dport; /* in network order */ + + uint8_t state; /* tcp state */ + uint8_t close_reason; /* close reason */ + uint8_t on_hash_table; + uint8_t on_timewait_list; + uint8_t ht_idx; + uint8_t closed; + uint8_t is_bound_addr; + uint8_t need_wnd_adv; + int16_t on_rto_idx; + + uint16_t on_timeout_list : 1, on_rcv_br_list : 1, on_snd_br_list : 1, saw_timestamp : 1, /* whether peer sends timestamp */ + sack_permit : 1, /* whether peer permits SACK */ + control_list_waiting : 1, have_reset : 1, is_external : 1, /* the peer node is locate outside of lan */ + wait_for_acks : 1; /* if true, the sender should wait for acks to catch up before sending again */ + + uint32_t snd_nxt; /* send next */ + uint32_t rcv_nxt; /* receive next */ +#if 
USE_CCP + uint32_t seq_at_last_loss; /* the sequence number we left off at before we stopped at wait_for_acks (due to loss) */ +#endif + + struct tcp_recv_vars *rcvvar; + struct tcp_send_vars *sndvar; +#if RATE_LIMIT_ENABLED + struct token_bucket *bucket; +#endif +#if PACING_ENABLED + struct packet_pacer *pacer; +#endif +#if USE_CCP + struct ccp_connection *ccp_conn; +#endif + + uint32_t last_active_ts; /* ts_last_ack_sent or ts_last_ts_upd */ + +} tcp_stream; + +extern inline const char *TCPStateToString(const tcp_stream *cur_stream); + +unsigned int HashFlow(const void *flow); + +int EqualFlow(const void *flow1, const void *flow2); + +#if USE_CCP +/*----------------------------------------------------------------------------*/ +unsigned int HashSID(const void *flow); + +int EqualSID(const void *flow1, const void *flow2); +/*----------------------------------------------------------------------------*/ +#endif + +extern inline int AddEpollEvent(struct mtcp_epoll *ep, int queue_type, socket_map_t socket, uint32_t event); + +extern inline void RaiseReadEvent(mtcp_manager_t mtcp, tcp_stream *stream); + +extern inline void RaiseWriteEvent(mtcp_manager_t mtcp, tcp_stream *stream); + +extern inline void RaiseCloseEvent(mtcp_manager_t mtcp, tcp_stream *stream); + +extern inline void RaiseErrorEvent(mtcp_manager_t mtcp, tcp_stream *stream); + +tcp_stream *CreateTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type, uint32_t saddr, uint16_t sport, uint32_t daddr, + uint16_t dport); + +void DestroyTCPStream(mtcp_manager_t mtcp, tcp_stream *stream); + +void DumpStream(mtcp_manager_t mtcp, tcp_stream *stream); + +extern inline void InitializeTCPStreamManager(void); + +#endif /* TCP_STREAM_H */ diff --git a/lib/flash/mtcp/include/tcp_stream_queue.h b/lib/flash/mtcp/include/tcp_stream_queue.h new file mode 100644 index 0000000..9f961b9 --- /dev/null +++ b/lib/flash/mtcp/include/tcp_stream_queue.h @@ -0,0 +1,105 @@ +/* + * mTCP source code is distributed under the 
Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TCP_STREAM_QUEUE +#define TCP_STREAM_QUEUE + +#include + +#ifndef LOCK_STREAM_QUEUE +#define LOCK_STREAM_QUEUE 0 +#endif + +/* Lock definitions for stream queue */ +#if LOCK_STREAM_QUEUE + +#if USE_SPIN_LOCK +#define SQ_LOCK_INIT(lock, errmsg, action) \ + ; \ + if (pthread_spin_init(lock, PTHREAD_PROCESS_PRIVATE)) { \ + perror("pthread_spin_init" errmsg); \ + action; \ + } +#define SQ_LOCK_DESTROY(lock) pthread_spin_destroy(lock) +#define SQ_LOCK(lock) pthread_spin_lock(lock) +#define SQ_UNLOCK(lock) pthread_spin_unlock(lock) +#else +#define SQ_LOCK_INIT(lock, errmsg, action) \ + ; \ + if (pthread_mutex_init(lock, NULL)) { \ + perror("pthread_mutex_init" errmsg); \ + action; \ + } +#define SQ_LOCK_DESTROY(lock) pthread_mutex_destroy(lock) +#define SQ_LOCK(lock) pthread_mutex_lock(lock) +#define SQ_UNLOCK(lock) pthread_mutex_unlock(lock) +#endif /* USE_SPIN_LOCK */ + +#else /* LOCK_STREAM_QUEUE */ +#define SQ_LOCK_INIT(lock, errmsg, action) (void)0 +#define SQ_LOCK_DESTROY(lock) (void)0 +#define SQ_LOCK(lock) (void)0 +#define SQ_UNLOCK(lock) (void)0 +#endif /* LOCK_STREAM_QUEUE */ + +/*---------------------------------------------------------------------------*/ +typedef struct stream_queue *stream_queue_t; +/*---------------------------------------------------------------------------*/ +typedef struct stream_queue_int { + struct tcp_stream **array; + int size; + + int first; + int last; + int count; + +} stream_queue_int; +/*---------------------------------------------------------------------------*/ +stream_queue_int *CreateInternalStreamQueue(int size); +/*---------------------------------------------------------------------------*/ +void DestroyInternalStreamQueue(stream_queue_int *sq); +/*---------------------------------------------------------------------------*/ +int StreamInternalEnqueue(stream_queue_int *sq, struct tcp_stream *stream); +/*---------------------------------------------------------------------------*/ +struct tcp_stream 
*StreamInternalDequeue(stream_queue_int *sq); +/*---------------------------------------------------------------------------*/ +stream_queue_t CreateStreamQueue(int size); +/*---------------------------------------------------------------------------*/ +void DestroyStreamQueue(stream_queue_t sq); +/*---------------------------------------------------------------------------*/ +int StreamEnqueue(stream_queue_t sq, struct tcp_stream *stream); +/*---------------------------------------------------------------------------*/ +struct tcp_stream *StreamDequeue(stream_queue_t sq); +/*---------------------------------------------------------------------------*/ +int StreamQueueIsEmpty(stream_queue_t sq); +/*---------------------------------------------------------------------------*/ + +#endif /* TCP_STREAM_QUEUE */ diff --git a/lib/flash/mtcp/include/tcp_util.h b/lib/flash/mtcp/include/tcp_util.h new file mode 100644 index 0000000..79703c2 --- /dev/null +++ b/lib/flash/mtcp/include/tcp_util.h @@ -0,0 +1,69 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TCP_UTIL_H +#define TCP_UTIL_H + +#include "mtcp.h" +#include "tcp_stream.h" + +#define MSS 1448 +#define INIT_CWND_PKTS 10 + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define SECONDS_TO_USECS(seconds) ((seconds) / 1000000.0) +#define USECS_TO_MS(us) ((us) / 1000) +#define BYTES_TO_BITS(bytes) ((bytes) / 8.0) +#define BPS_TO_MBPS(bps) ((bps) / 8000000.0) +#define UNSHIFT_RTT(srtt) ((srtt) * 125.0) + +struct tcp_timestamp { + uint32_t ts_val; + uint32_t ts_ref; +}; + +void ParseTCPOptions(tcp_stream *cur_stream, uint32_t cur_ts, const uint8_t *tcpopt, int len); + +extern inline int ParseTCPTimestamp(tcp_stream *cur_stream, struct tcp_timestamp *ts, uint8_t *tcpopt, int len); + +#if TCP_OPT_SACK_ENABLED +int SeqIsSacked(tcp_stream *cur_stream, uint32_t seq); + +void ParseSACKOption(tcp_stream *cur_stream, uint32_t ack_seq, uint8_t *tcpopt, int len); +#endif + +uint16_t TCPCalcChecksum(uint16_t *buf, uint16_t len, uint32_t saddr, uint32_t daddr); + +void PrintTCPOptions(uint8_t *tcpopt, int len); + +#endif /* TCP_UTIL_H */ diff --git a/lib/flash/mtcp/include/timer.h b/lib/flash/mtcp/include/timer.h new file mode 100644 index 0000000..c92daf4 --- /dev/null 
+++ b/lib/flash/mtcp/include/timer.h @@ -0,0 +1,71 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TIMER_H +#define TIMER_H + +#include "mtcp.h" +#include "tcp_stream.h" + +#define RTO_HASH 3000 + +struct rto_hashstore { + uint32_t rto_now_idx; // pointing the hs_table_s index + uint32_t rto_now_ts; // + + TAILQ_HEAD(rto_head, tcp_stream) rto_list[RTO_HASH + 1]; +}; + +struct rto_hashstore *InitRTOHashstore(void); + +extern inline void AddtoRTOList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void RemoveFromRTOList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void AddtoTimewaitList(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts); + +extern inline void RemoveFromTimewaitList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void AddtoTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void RemoveFromTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void UpdateTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream); + +extern inline void UpdateRetransmissionTimer(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts); + +void CheckRtmTimeout(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh); + +void CheckTimewaitExpire(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh); + +void CheckConnectionTimeout(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh); + +#endif /* TIMER_H */ diff --git a/lib/flash/mtcp/io_module.c b/lib/flash/mtcp/io_module.c new file mode 100644 index 0000000..c10df91 --- /dev/null +++ b/lib/flash/mtcp/io_module.c @@ -0,0 +1,749 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* for I/O module def'ns */ +#include "io_module.h" +/* for num_devices decl */ +#include "config.h" +/* std lib funcs */ +#include +/* std io funcs */ +#include +/* strcmp func etc. 
*/ +#include +/* for ifreq struct */ +#include +/* for ioctl */ +#include +#ifndef DISABLE_DPDK +#define RTE_ARGC_MAX (RTE_MAX_ETHPORTS << 1) + 9 +/* for dpdk ethernet functions (get mac addresses) */ +#include +#include +/* for ceil func */ +#include +/* for retrieving rte version(s) */ +#include +#endif /* DISABLE_DPDK */ +/* for TRACE_* */ +#include "debug.h" +/* for inet_* */ +#include +#include +#include +/* for getopt() */ +#include +/* for getifaddrs */ +#include +#include +#ifdef ENABLE_ONVM +/* for onvm */ +#include "onvm_nflib.h" +#include "onvm_pkt_helper.h" +/* for dpdk/onvm big ints */ +#include +#endif +/* for file opening */ +#include +#include +/* for netmap macros */ +#include "netmap_user.h" +/*----------------------------------------------------------------------------*/ +io_module_func *current_iomodule_func = &dpdk_module_func; +#ifndef DISABLE_DPDK +enum rte_proc_type_t eal_proc_type_detect(void); +/** + * DPDK's RTE consumes some huge pages for internal bookkeeping. + * Therefore, it is not always safe to reserve the exact amount + * of pages for our stack (e.g. dividing requested mem, in MB, by + * (1<<20) would be insufficient). Hence, the following value. + */ +#define RTE_SOCKET_MEM_SHIFT ((1 << 19) | (1 << 18)) +#endif +/*----------------------------------------------------------------------------*/ +#define ALL_STRING "all" +#define MAX_PROCLINE_LEN 1024 +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +/*----------------------------------------------------------------------------*/ + +/* onvm struct for port info lookup */ +extern struct port_info *ports; + +#ifndef DISABLE_PSIO +static int GetNumQueues() +{ + FILE *fp; + char buf[MAX_PROCLINE_LEN]; + int queue_cnt; + + fp = fopen("/proc/interrupts", "r"); + if (!fp) { + TRACE_CONFIG("Failed to read data from /proc/interrupts!\n"); + return -1; + } + + /* count number of NIC queues from /proc/interrupts */ + queue_cnt = 0; + while (!feof(fp)) { + if (fgets(buf, MAX_PROCLINE_LEN, fp) == NULL) + break; + + /* "xge0-rx" is the keyword for counting queues */ + if (strstr(buf, "xge0-rx")) { + queue_cnt++; + } + } + fclose(fp); + + return queue_cnt; +} +#endif /* !PSIO */ +/*----------------------------------------------------------------------------*/ +#ifndef DISABLE_DPDK +/** + * returns max numa ID while probing for rte devices + */ +static int probe_all_rte_devices(char **argv, int *argc, char *dev_name_list) +{ + PciDevice pd; + int fd, numa_id = -1; + static char end[] = ""; + static const char delim[] = " \t"; + static char *dev_tokenizer; + char *dev_token, *saveptr; + + dev_tokenizer = strdup(dev_name_list); + if (dev_tokenizer == NULL) { + TRACE_ERROR("Can't allocate memory for dev_tokenizer!\n"); + exit(EXIT_FAILURE); + } + fd = open(DEV_PATH, O_RDONLY); + if (fd != -1) { + dev_token = strtok_r(dev_tokenizer, delim, &saveptr); + while (dev_token != NULL) { + strcpy(pd.ifname, dev_token); + if (ioctl(fd, FETCH_PCI_ADDRESS, &pd) == -1) { + TRACE_DBG("Could not find pci info on dpdk " + "device: %s. Is it a dpdk-attached " + "interface?\n", + dev_token); + goto loop_over; + } + argv[*argc] = strdup("-w"); + argv[*argc + 1] = calloc(PCI_LENGTH, 1); + if (argv[*argc] == NULL || argv[*argc + 1] == NULL) { + TRACE_ERROR("Memory allocation error!\n"); + exit(EXIT_FAILURE); + } + sprintf(argv[*argc + 1], PCI_DOM ":" PCI_BUS ":" PCI_DEVICE "." 
PCI_FUNC, pd.pa.domain, pd.pa.bus, + pd.pa.device, pd.pa.function); + *argc += 2; + if (pd.numa_socket > numa_id) + numa_id = pd.numa_socket; +loop_over: + dev_token = strtok_r(NULL, delim, &saveptr); + } + close(fd); + free(dev_tokenizer); + } else { + TRACE_ERROR("Error opening dpdk-face!\n"); + exit(EXIT_FAILURE); + } + + /* add the terminating "" sequence */ + argv[*argc] = end; + + return numa_id; +} +#endif /* !DISABLE_DPDK */ +/*----------------------------------------------------------------------------*/ +int SetNetEnv(char *dev_name_list, char *port_stat_list) +{ + int eidx = 0; + int i, j; + + int set_all_inf = (strncmp(dev_name_list, ALL_STRING, sizeof(ALL_STRING)) == 0); + + TRACE_CONFIG("Loading interface setting\n"); + + CONFIG.eths = (struct eth_table *)calloc(MAX_DEVICES, sizeof(struct eth_table)); + if (!CONFIG.eths) { + TRACE_ERROR("Can't allocate space for CONFIG.eths\n"); + exit(EXIT_FAILURE); + } + + if (current_iomodule_func == &ps_module_func) { +#ifndef DISABLE_PSIO + struct ifreq ifr; + /* calculate num_devices now! 
*/ + num_devices = ps_list_devices(devices); + if (num_devices == -1) { + perror("ps_list_devices"); + exit(EXIT_FAILURE); + } + + /* Create socket */ + int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sock == -1) { + TRACE_ERROR("socket"); + exit(EXIT_FAILURE); + } + + /* To Do: Parse dev_name_list rather than use strstr */ + for (i = 0; i < num_devices; i++) { + strcpy(ifr.ifr_name, devices[i].name); + + /* getting interface information */ + if (ioctl(sock, SIOCGIFFLAGS, &ifr) == 0) { + if (!set_all_inf && strstr(dev_name_list, ifr.ifr_name) == NULL) + continue; + + /* Setting informations */ + eidx = CONFIG.eths_num++; + strcpy(CONFIG.eths[eidx].dev_name, ifr.ifr_name); + CONFIG.eths[eidx].ifindex = devices[i].ifindex; + + /* getting address */ + if (ioctl(sock, SIOCGIFADDR, &ifr) == 0) { + struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + CONFIG.eths[eidx].ip_addr = *(uint32_t *)&sin; + } + + if (ioctl(sock, SIOCGIFHWADDR, &ifr) == 0) { + for (j = 0; j < ETH_ALEN; j++) { + CONFIG.eths[eidx].haddr[j] = ifr.ifr_addr.sa_data[j]; + } + } + + /* Net MASK */ + if (ioctl(sock, SIOCGIFNETMASK, &ifr) == 0) { + struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + CONFIG.eths[eidx].netmask = *(uint32_t *)&sin; + } + + /* add to attached devices */ + for (j = 0; j < num_devices_attached; j++) { + if (devices_attached[j] == devices[i].ifindex) { + break; + } + } + devices_attached[num_devices_attached] = devices[i].ifindex; + num_devices_attached++; + + } else { + perror("SIOCGIFFLAGS"); + } + } + num_queues = GetNumQueues(); + if (num_queues <= 0) { + TRACE_CONFIG("Failed to find NIC queues!\n"); + close(sock); + return -1; + } + if (num_queues > num_cpus) { + TRACE_CONFIG("Too many NIC queues available.\n"); + close(sock); + return -1; + } + close(sock); +#endif /* !PSIO_MODULE */ + } else if (current_iomodule_func == &dpdk_module_func) { +#ifndef DISABLE_DPDK + int cpu = CONFIG.num_cores; + mpz_t _cpumask; + char 
cpumaskbuf[32] = ""; + char mem_channels[8] = ""; + char socket_mem_str[32] = ""; + // int i; + int ret, socket_mem; +#if RTE_VERSION < RTE_VERSION_NUM(19, 8, 0, 0) + static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; +#else + static struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; +#endif + + /* STEP 1: first determine CPU mask */ + mpz_init(_cpumask); + + if (!mpz_cmp(_cpumask, CONFIG._cpumask)) { + /* get the cpu mask */ + for (ret = 0; ret < cpu; ret++) + mpz_setbit(_cpumask, ret); + + gmp_sprintf(cpumaskbuf, "%ZX", _cpumask); + } else + gmp_sprintf(cpumaskbuf, "%ZX", CONFIG._cpumask); + + mpz_clear(_cpumask); + + /* STEP 2: determine memory channels per socket */ + /* get the mem channels per socket */ + if (CONFIG.num_mem_ch == 0) { + TRACE_ERROR("DPDK module requires # of memory channels " + "per socket parameter!\n"); + exit(EXIT_FAILURE); + } + sprintf(mem_channels, "%d", CONFIG.num_mem_ch); + + /* STEP 3: determine socket memory */ + /* get socket memory threshold (in MB) */ + socket_mem = RTE_ALIGN_CEIL((unsigned long)ceil((CONFIG.num_cores * + (CONFIG.rcvbuf_size + CONFIG.sndbuf_size + sizeof(struct tcp_stream) + + sizeof(struct tcp_recv_vars) + sizeof(struct tcp_send_vars) + + sizeof(struct fragment_ctx)) * + CONFIG.max_concurrency) / + RTE_SOCKET_MEM_SHIFT), + RTE_CACHE_LINE_SIZE); + + /* initialize the rte env, what a waste of implementation effort! 
*/ + int argc = 6; //8; + char *argv[RTE_ARGC_MAX] = { "", + "-c", + cpumaskbuf, + "-n", + mem_channels, +#if 0 + "--socket-mem", + socket_mem_str, +#endif + "--proc-type=auto" }; + ret = probe_all_rte_devices(argv, &argc, dev_name_list); + + /* STEP 4: build up socket mem parameter */ + sprintf(socket_mem_str, "%d", socket_mem); +#if 0 + char *smsptr = socket_mem_str + strlen(socket_mem_str); + for (i = 1; i < ret + 1; i++) { + sprintf(smsptr, ",%d", socket_mem); + smsptr += strlen(smsptr); + } + TRACE_DBG("socket_mem: %s\n", socket_mem_str); +#endif + /* + * re-set getopt extern variable optind. + * this issue was a bitch to debug + * rte_eal_init() internally uses getopt() syscall + * mtcp applications that also use an `external' getopt + * will cause a violent crash if optind is not reset to zero + * prior to calling the func below... + * see man getopt(3) for more details + */ + optind = 0; + +#ifdef DEBUG + /* print argv's */ + for (i = 0; i < argc; i++) + TRACE_INFO("argv[%d]: %s\n", i, argv[i]); +#endif + /* initialize the dpdk eal env */ + ret = rte_eal_init(argc, argv); + if (ret < 0) { + TRACE_ERROR("Invalid EAL args!\n"); + exit(EXIT_FAILURE); + } + /* give me the count of 'detected' ethernet ports */ +#if RTE_VERSION < RTE_VERSION_NUM(18, 5, 0, 0) + num_devices = rte_eth_dev_count(); +#else + num_devices = rte_eth_dev_count_avail(); +#endif + if (num_devices == 0) { + TRACE_ERROR("No Ethernet port!\n"); + exit(EXIT_FAILURE); + } + + /* get mac addr entries of 'detected' dpdk ports */ + for (ret = 0; ret < num_devices; ret++) + rte_eth_macaddr_get(ret, &ports_eth_addr[ret]); + + num_queues = MIN(CONFIG.num_cores, MAX_CPUS); + + struct ifaddrs *ifap; + struct ifaddrs *iter_if; + char *seek; + + if (getifaddrs(&ifap) != 0) { + perror("getifaddrs: "); + exit(EXIT_FAILURE); + } + + iter_if = ifap; + do { + if (iter_if->ifa_addr && iter_if->ifa_addr->sa_family == AF_INET && !set_all_inf && + (seek = strstr(dev_name_list, iter_if->ifa_name)) != NULL && + /* 
check if the interface was not aliased */ + *(seek + strlen(iter_if->ifa_name)) != ':') { + struct ifreq ifr; + + /* Setting informations */ + eidx = CONFIG.eths_num++; + strcpy(CONFIG.eths[eidx].dev_name, iter_if->ifa_name); + strcpy(ifr.ifr_name, iter_if->ifa_name); + + /* Create socket */ + int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sock == -1) { + perror("socket"); + exit(EXIT_FAILURE); + } + + /* getting address */ + if (ioctl(sock, SIOCGIFADDR, &ifr) == 0) { + struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + CONFIG.eths[eidx].ip_addr = *(uint32_t *)&sin; + } + + if (ioctl(sock, SIOCGIFHWADDR, &ifr) == 0) { + for (j = 0; j < ETH_ALEN; j++) { + CONFIG.eths[eidx].haddr[j] = ifr.ifr_addr.sa_data[j]; + } + } + + /* Net MASK */ + if (ioctl(sock, SIOCGIFNETMASK, &ifr) == 0) { + struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + CONFIG.eths[eidx].netmask = *(uint32_t *)&sin; + } + close(sock); + + for (j = 0; j < num_devices; j++) { + if (!memcmp(&CONFIG.eths[eidx].haddr[0], &ports_eth_addr[j], ETH_ALEN)) + CONFIG.eths[eidx].ifindex = j; + } + + /* add to attached devices */ + for (j = 0; j < num_devices_attached; j++) { + if (devices_attached[j] == CONFIG.eths[eidx].ifindex) { + break; + } + } + devices_attached[num_devices_attached] = CONFIG.eths[eidx].ifindex; + num_devices_attached++; + fprintf(stderr, "Total number of attached devices: %d\n", num_devices_attached); + fprintf(stderr, "Interface name: %s\n", iter_if->ifa_name); + } + iter_if = iter_if->ifa_next; + } while (iter_if != NULL); + + freeifaddrs(ifap); +#if 0 + /* + * XXX: It seems that there is a bug in the RTE SDK. + * The dynamically allocated rte_argv params are left + * as dangling pointers. Freeing them causes program + * to crash. 
+ */ + + /* free up all resources */ + for (; rte_argc >= 9; rte_argc--) { + if (rte_argv[rte_argc] != NULL) { + fprintf(stderr, "Cleaning up rte_argv[%d]: %s (%p)\n", + rte_argc, rte_argv[rte_argc], rte_argv[rte_argc]); + free(rte_argv[rte_argc]); + rte_argv[rte_argc] = NULL; + } + } +#endif + /* check if process is primary or secondary */ + CONFIG.multi_process_is_master = (eal_proc_type_detect() == RTE_PROC_PRIMARY) ? 1 : 0; + +#endif /* !DISABLE_DPDK */ +#ifndef DISABLE_NETMAP + } else if (current_iomodule_func == &netmap_module_func) +#endif +#ifndef DISABLE_AFXDP +} +else if (current_iomodule_func == &afxdp_module_func) +#endif +{ +#ifndef DISABLE_AFXDP //m-> come back here to handle NETMAP as well + struct ifaddrs *ifap; + struct ifaddrs *iter_if; + char *seek; + + num_queues = MIN(CONFIG.num_cores, MAX_CPUS); + + if (getifaddrs(&ifap) != 0) { + perror("getifaddrs: "); + exit(EXIT_FAILURE); + } + + iter_if = ifap; + do { + if (iter_if->ifa_addr && iter_if->ifa_addr->sa_family == AF_INET && !set_all_inf && + (seek = strstr(dev_name_list, iter_if->ifa_name)) != NULL && + /* check if the interface was not aliased */ + *(seek + strlen(iter_if->ifa_name)) != ':') { + struct ifreq ifr; + + /* Setting informations */ + eidx = CONFIG.eths_num++; + strcpy(CONFIG.eths[eidx].dev_name, iter_if->ifa_name); + strcpy(ifr.ifr_name, iter_if->ifa_name); + + /* Create socket */ + int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sock == -1) { + perror("socket"); + exit(EXIT_FAILURE); + } + + /* getting address */ + if (ioctl(sock, SIOCGIFADDR, &ifr) == 0) { + struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + CONFIG.eths[eidx].ip_addr = *(uint32_t *)&sin; + } + + if (ioctl(sock, SIOCGIFHWADDR, &ifr) == 0) { + for (j = 0; j < ETH_ALEN; j++) { + CONFIG.eths[eidx].haddr[j] = ifr.ifr_addr.sa_data[j]; + } + } + + /* Net MASK */ + if (ioctl(sock, SIOCGIFNETMASK, &ifr) == 0) { + struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + 
CONFIG.eths[eidx].netmask = *(uint32_t *)&sin; + } + close(sock); +#if 0 + for (j = 0; j < num_devices; j++) { + if (!memcmp(&CONFIG.eths[eidx].haddr[0], &ports_eth_addr[j], + ETH_ALEN)) + CONFIG.eths[eidx].ifindex = ifr.ifr_ifindex; +#endif + CONFIG.eths[eidx].ifindex = eidx; + TRACE_INFO("Ifindex of interface %s is: %d\n", ifr.ifr_name, CONFIG.eths[eidx].ifindex); +#if 0 + } +#endif + + /* add to attached devices */ + for (j = 0; j < num_devices_attached; j++) { + if (devices_attached[j] == CONFIG.eths[eidx].ifindex) { + break; + } + } + devices_attached[num_devices_attached] = if_nametoindex(ifr.ifr_name); + num_devices_attached++; + fprintf(stderr, "Total number of attached devices: %d\n", num_devices_attached); + fprintf(stderr, "Interface name: %s\n", iter_if->ifa_name); + } + iter_if = iter_if->ifa_next; + } while (iter_if != NULL); + + freeifaddrs(ifap); +#endif /* !DISABLE_NETMAP */ +} +else if (current_iomodule_func == &onvm_module_func) +{ +#ifdef ENABLE_ONVM + int cpu = CONFIG.num_cores; + mpz_t cpumask; + char cpumaskbuf[32]; + char mem_channels[8]; + char service[6]; + char instance[6]; + int ret; + + mpz_init(cpumask); + + /* get the cpu mask */ + for (ret = 0; ret < cpu; ret++) + mpz_setbit(cpumask, ret); + gmp_sprintf(cpumaskbuf, "%ZX", cpumask); + + mpz_clear(cpumask); + + /* get the mem channels per socket */ + if (CONFIG.num_mem_ch == 0) { + TRACE_ERROR("DPDK module requires # of memory channels " + "per socket parameter!\n"); + exit(EXIT_FAILURE); + } + sprintf(mem_channels, "%d", CONFIG.num_mem_ch); + sprintf(service, "%d", CONFIG.onvm_serv); + sprintf(instance, "%d", CONFIG.onvm_inst); + + /* initialize the rte env first, what a waste of implementation effort! */ + char *argv[] = { "", "-c", cpumaskbuf, "-n", mem_channels, "--proc-type=secondary", "--", "-r", service, instance, "" }; + + const int argc = 10; + + /* + * re-set getopt extern variable optind. 
+ * this issue was a bitch to debug + * rte_eal_init() internally uses getopt() syscall + * mtcp applications that also use an `external' getopt + * will cause a violent crash if optind is not reset to zero + * prior to calling the func below... + * see man getopt(3) for more details + */ + optind = 0; + + /* Initialize onvm */ + CONFIG.nf_local_ctx = onvm_nflib_init_nf_local_ctx(); + ret = onvm_nflib_init(argc, argv, "mtcp_nf", CONFIG.nf_local_ctx, NULL); + if (ret < 0) { + TRACE_ERROR("Invalid EAL args!\n"); + exit(EXIT_FAILURE); + } + /* give me the count of 'detected' ethernet ports */ + num_devices = ports->num_ports; + if (num_devices == 0) { + TRACE_ERROR("No Ethernet port!\n"); + exit(EXIT_FAILURE); + } + + num_queues = MIN(CONFIG.num_cores, MAX_CPUS); + + struct ifaddrs *ifap; + struct ifaddrs *iter_if; + char *seek; + + if (getifaddrs(&ifap) != 0) { + perror("getifaddrs: "); + exit(EXIT_FAILURE); + } + + iter_if = ifap; + do { + if (iter_if->ifa_addr && iter_if->ifa_addr->sa_family == AF_INET && !set_all_inf && + (seek = strstr(dev_name_list, iter_if->ifa_name)) != NULL && + /* check if the interface was not aliased */ + *(seek + strlen(iter_if->ifa_name)) != ':') { + struct ifreq ifr; + + /* Setting informations */ + eidx = CONFIG.eths_num++; + strcpy(CONFIG.eths[eidx].dev_name, iter_if->ifa_name); + strcpy(ifr.ifr_name, iter_if->ifa_name); + + /* Create socket */ + int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sock == -1) { + perror("socket"); + } + + /* getting address */ + if (ioctl(sock, SIOCGIFADDR, &ifr) == 0) { + struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + CONFIG.eths[eidx].ip_addr = *(uint32_t *)&sin; + } + + for (j = 0; j < ETH_ALEN; j++) { + CONFIG.eths[eidx].haddr[j] = ports->mac[eidx].addr_bytes[j]; + }; + + /* Net MASK */ + if (ioctl(sock, SIOCGIFNETMASK, &ifr) == 0) { + struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + CONFIG.eths[eidx].netmask = *(uint32_t *)&sin; + } + 
close(sock); + + CONFIG.eths[eidx].ifindex = ports->id[eidx]; + devices_attached[num_devices_attached] = CONFIG.eths[eidx].ifindex; + num_devices_attached++; + fprintf(stderr, "Total number of attached devices: %d\n", num_devices_attached); + fprintf(stderr, "Interface name: %s\n", iter_if->ifa_name); + } + iter_if = iter_if->ifa_next; + } while (iter_if != NULL); + + freeifaddrs(ifap); +#endif /* ENABLE_ONVM */ +} + +CONFIG.nif_to_eidx = (int *)calloc(MAX_DEVICES, sizeof(int)); + +if (!CONFIG.nif_to_eidx) { + exit(EXIT_FAILURE); +} + +for (i = 0; i < MAX_DEVICES; ++i) { + CONFIG.nif_to_eidx[i] = -1; +} + +for (i = 0; i < CONFIG.eths_num; ++i) { + j = CONFIG.eths[i].ifindex; + if (j >= MAX_DEVICES) { + TRACE_ERROR("ifindex of eths_%d exceed the limit: %d\n", i, j); + exit(EXIT_FAILURE); + } + + /* the physic port index of the i-th port listed in the config file is j*/ + CONFIG.nif_to_eidx[j] = i; + + /* finally set the port stats option `on' */ + if (strstr(port_stat_list, CONFIG.eths[i].dev_name) != 0) + CONFIG.eths[i].stat_print = TRUE; +} + +return 0; +} +/*----------------------------------------------------------------------------*/ +int FetchEndianType(void) +{ +#ifndef DISABLE_DPDK + char *argv; + char **argp = &argv; + /* dpdk_module_func/onvm_module_func logic down below */ + if (current_iomodule_func == &dpdk_module_func) { + (*current_iomodule_func).dev_ioctl(NULL, CONFIG.eths[0].ifindex, DRV_NAME, (void *)argp); + if (!strcmp(*argp, "net_i40e")) + return 1; + } +#endif + return 0; +} +/*----------------------------------------------------------------------------*/ +int CheckIOModuleAccessPermissions(void) +{ +#ifndef DISABLE_NETMAP + int fd; + /* check if netmap module can access I/O with sudo privileges */ + if (current_iomodule_func == &netmap_module_func) { + fd = open(NETMAP_DEVICE_NAME, O_RDONLY); + if (fd != -1) + close(fd); + return fd; + } +#endif //DISABLE_NETMAP + + /* sudo privileges are definitely needed otherwise */ + if (geteuid()) + 
return -1; + + return 0; +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/ip_in.c b/lib/flash/mtcp/ip_in.c new file mode 100644 index 0000000..5072f53 --- /dev/null +++ b/lib/flash/mtcp/ip_in.c @@ -0,0 +1,92 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include + +#include "ip_in.h" +#include "tcp_in.h" +#include "mtcp_api.h" +#include "ps.h" +#include "debug.h" +#include "icmp.h" + +#define ETH_P_IP_FRAG 0xF800 +#define ETH_P_IPV6_FRAG 0xF6DD + +/*----------------------------------------------------------------------------*/ +inline int ProcessIPv4Packet(mtcp_manager_t mtcp, uint32_t cur_ts, const int ifidx, unsigned char *pkt_data, int len) +{ + (void)len; + /* check and process IPv4 packets */ + struct iphdr *iph = (struct iphdr *)(pkt_data + sizeof(struct ethhdr)); + int ip_len = ntohs(iph->tot_len); + int rc = -1; + + /* drop the packet shorter than ip header */ + if (ip_len < (int)sizeof(struct iphdr)) + return ERROR; + +#ifndef DISABLE_HWCSUM + if (mtcp->iom->dev_ioctl != NULL) + rc = mtcp->iom->dev_ioctl(mtcp->ctx, ifidx, PKT_RX_IP_CSUM, iph); + if (rc == -1 && ip_fast_csum(iph, iph->ihl)) + return ERROR; +#else + UNUSED(rc); + if (ip_fast_csum(iph, iph->ihl)) + return ERROR; +#endif + +#if !PROMISCUOUS_MODE + /* if not promiscuous mode, drop if the destination is not myself */ + if (iph->daddr != CONFIG.eths[ifidx].ip_addr) + //DumpIPPacketToFile(stderr, iph, ip_len); + return TRUE; +#endif + + // see if the version is correct + if (iph->version != 0x4) { + return FALSE; + } + + switch (iph->protocol) { + case IPPROTO_TCP: + return ProcessTCPPacket(mtcp, cur_ts, ifidx, iph, ip_len); + case IPPROTO_ICMP: + return ProcessICMPPacket(mtcp, iph, ip_len); + default: + /* currently drop other protocols */ + return FALSE; + } + return FALSE; +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/ip_out.c b/lib/flash/mtcp/ip_out.c new file mode 100644 index 0000000..04af790 --- /dev/null +++ b/lib/flash/mtcp/ip_out.c @@ -0,0 +1,199 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "ip_out.h" +#include "ip_in.h" +#include "eth_out.h" +#include "arp.h" +#include "debug.h" + +/*----------------------------------------------------------------------------*/ +inline int GetOutputInterface(uint32_t daddr, uint8_t *is_external) +{ + int nif = -1; + int i; + int prefix = 0; + + *is_external = 0; + /* Longest prefix matching */ + for (i = 0; i < CONFIG.routes; i++) { + if ((daddr & CONFIG.rtable[i].mask) == CONFIG.rtable[i].masked) { + if (CONFIG.rtable[i].prefix > prefix) { + nif = CONFIG.rtable[i].nif; + prefix = CONFIG.rtable[i].prefix; + } else if (CONFIG.gateway) { + *is_external = 1; + nif = (CONFIG.gateway)->nif; + } + break; + } + } + + if (nif < 0) { + uint8_t *da = (uint8_t *)&daddr; + TRACE_ERROR("[WARNING] No route to %u.%u.%u.%u\n", da[0], da[1], da[2], da[3]); + assert(0); + } + + return nif; +} +/*----------------------------------------------------------------------------*/ +uint8_t *IPOutputStandalone(struct mtcp_manager *mtcp, uint8_t protocol, uint16_t ip_id, uint32_t saddr, uint32_t daddr, + uint16_t payloadlen) +{ + struct iphdr *iph; + int nif; + unsigned char *haddr, is_external; + int rc = -1; + + nif = GetOutputInterface(daddr, &is_external); + if (nif < 0) + return NULL; + + haddr = GetDestinationHWaddr(daddr, is_external); + if (!haddr) { +#if 0 + uint8_t *da = (uint8_t *)&daddr; + TRACE_INFO("[WARNING] The destination IP %u.%u.%u.%u " + "is not in ARP table!\n", + da[0], da[1], da[2], da[3]); +#endif + RequestARP(mtcp, (is_external) ? 
((CONFIG.gateway)->daddr) : daddr, nif, mtcp->cur_ts); + return NULL; + } + + iph = (struct iphdr *)EthernetOutput(mtcp, ETH_P_IP, nif, haddr, payloadlen + IP_HEADER_LEN); + if (!iph) { + return NULL; + } + + iph->ihl = IP_HEADER_LEN >> 2; + iph->version = 4; + iph->tos = 0; + iph->tot_len = htons(IP_HEADER_LEN + payloadlen); + iph->id = htons(ip_id); + iph->frag_off = htons(IP_DF); // no fragmentation + iph->ttl = 64; + iph->protocol = protocol; + iph->saddr = saddr; + iph->daddr = daddr; + iph->check = 0; + +#ifndef DISABLE_HWCSUM + if (mtcp->iom->dev_ioctl != NULL) { + switch (iph->protocol) { + case IPPROTO_TCP: + rc = mtcp->iom->dev_ioctl(mtcp->ctx, nif, PKT_TX_TCPIP_CSUM_PEEK, iph); + break; + case IPPROTO_ICMP: + rc = mtcp->iom->dev_ioctl(mtcp->ctx, nif, PKT_TX_IP_CSUM, iph); + break; + } + } + /* otherwise calculate IP checksum in S/W */ + if (rc == -1) + iph->check = ip_fast_csum(iph, iph->ihl); +#else + UNUSED(rc); + iph->check = ip_fast_csum(iph, iph->ihl); +#endif + + return (uint8_t *)(iph + 1); +} +/*----------------------------------------------------------------------------*/ +uint8_t *IPOutput(struct mtcp_manager *mtcp, tcp_stream *stream, uint16_t tcplen) +{ + struct iphdr *iph; + int nif; + unsigned char *haddr, is_external = 0; + int rc = -1; + + if (stream->sndvar->nif_out >= 0) { + nif = stream->sndvar->nif_out; + } else { + nif = GetOutputInterface(stream->daddr, &is_external); + stream->sndvar->nif_out = nif; + stream->is_external = is_external; + } + + haddr = GetDestinationHWaddr(stream->daddr, stream->is_external); + if (!haddr) { +#if 0 + uint8_t *da = (uint8_t *)&stream->daddr; + TRACE_INFO("[WARNING] The destination IP %u.%u.%u.%u " + "is not in ARP table!\n", + da[0], da[1], da[2], da[3]); +#endif + /* if not found in the arp table, send arp request and return NULL */ + /* tcp will retry sending the packet later */ + RequestARP(mtcp, (stream->is_external) ? 
(CONFIG.gateway)->daddr : stream->daddr, stream->sndvar->nif_out, + mtcp->cur_ts); + return NULL; + } + + iph = (struct iphdr *)EthernetOutput(mtcp, ETH_P_IP, stream->sndvar->nif_out, haddr, tcplen + IP_HEADER_LEN); + if (!iph) { + return NULL; + } + + iph->ihl = IP_HEADER_LEN >> 2; + iph->version = 4; + iph->tos = 0; + iph->tot_len = htons(IP_HEADER_LEN + tcplen); + iph->id = htons(stream->sndvar->ip_id++); + iph->frag_off = htons(0x4000); // no fragmentation + iph->ttl = 64; + iph->protocol = IPPROTO_TCP; + iph->saddr = stream->saddr; + iph->daddr = stream->daddr; + iph->check = 0; + +#ifndef DISABLE_HWCSUM + /* offload IP checkum if possible */ + if (mtcp->iom->dev_ioctl != NULL) { + switch (iph->protocol) { + case IPPROTO_TCP: + rc = mtcp->iom->dev_ioctl(mtcp->ctx, nif, PKT_TX_TCPIP_CSUM_PEEK, iph); + break; + case IPPROTO_ICMP: + rc = mtcp->iom->dev_ioctl(mtcp->ctx, nif, PKT_TX_IP_CSUM, iph); + break; + } + } + /* otherwise calculate IP checksum in S/W */ + if (rc == -1) + iph->check = ip_fast_csum(iph, iph->ihl); +#else + UNUSED(rc); + iph->check = ip_fast_csum(iph, iph->ihl); +#endif + return (uint8_t *)(iph + 1); +} diff --git a/lib/flash/mtcp/logger.c b/lib/flash/mtcp/logger.c new file mode 100644 index 0000000..1484fbb --- /dev/null +++ b/lib/flash/mtcp/logger.c @@ -0,0 +1,193 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cpu.h" +#include "debug.h" +#include "logger.h" + +/*----------------------------------------------------------------------------*/ +static void EnqueueFreeBuffer(log_thread_context *ctx, log_buff *free_bp) +{ + pthread_mutex_lock(&ctx->free_mutex); + TAILQ_INSERT_TAIL(&ctx->free_queue, free_bp, buff_link); + ctx->free_buff_cnt++; + + assert(ctx->free_buff_cnt <= NUM_LOG_BUFF); + assert(ctx->free_buff_cnt + ctx->job_buff_cnt <= NUM_LOG_BUFF); + pthread_mutex_unlock(&ctx->free_mutex); +} +/*----------------------------------------------------------------------------*/ +log_buff *DequeueFreeBuffer(log_thread_context *ctx) +{ + pthread_mutex_lock(&ctx->free_mutex); + log_buff *free_bp = TAILQ_FIRST(&ctx->free_queue); + if (free_bp) { + TAILQ_REMOVE(&ctx->free_queue, free_bp, buff_link); + ctx->free_buff_cnt--; + } + + assert(ctx->free_buff_cnt >= 0); + assert(ctx->free_buff_cnt + ctx->job_buff_cnt <= NUM_LOG_BUFF); + pthread_mutex_unlock(&ctx->free_mutex); + return (free_bp); +} +/*----------------------------------------------------------------------------*/ +void EnqueueJobBuffer(log_thread_context *ctx, log_buff *working_bp) +{ + TAILQ_INSERT_TAIL(&ctx->working_queue, working_bp, buff_link); + ctx->job_buff_cnt++; + ctx->state = ACTIVE_LOGT; + assert(ctx->job_buff_cnt <= NUM_LOG_BUFF); + if (ctx->free_buff_cnt + ctx->job_buff_cnt > NUM_LOG_BUFF) { + TRACE_ERROR("free_buff_cnt(%d) + job_buff_cnt(%d) > NUM_LOG_BUFF(%d)\n", ctx->free_buff_cnt, ctx->job_buff_cnt, + NUM_LOG_BUFF); + } + assert(ctx->free_buff_cnt + ctx->job_buff_cnt <= NUM_LOG_BUFF); +} +/*----------------------------------------------------------------------------*/ +static log_buff *DequeueJobBuffer(log_thread_context *ctx) +{ + pthread_mutex_lock(&ctx->mutex); + log_buff *working_bp = TAILQ_FIRST(&ctx->working_queue); + if (working_bp) { + TAILQ_REMOVE(&ctx->working_queue, working_bp, 
buff_link); + ctx->job_buff_cnt--; + } else { + ctx->state = IDLE_LOGT; + } + + assert(ctx->job_buff_cnt >= 0); + assert(ctx->free_buff_cnt + ctx->job_buff_cnt <= NUM_LOG_BUFF); + pthread_mutex_unlock(&ctx->mutex); + return (working_bp); +} +/*----------------------------------------------------------------------------*/ +void InitLogThreadContext(struct log_thread_context *ctx, int cpu) +{ + int i; + int sv[2]; + + /* initialize log_thread_context */ + memset(ctx, 0, sizeof(struct log_thread_context)); + ctx->cpu = cpu; + ctx->state = IDLE_LOGT; + ctx->done = 0; + + if (pipe(sv)) { + fprintf(stderr, "pipe() failed, errno=%d, errstr=%s\n", errno, strerror(errno)); + exit(1); + } + ctx->sp_fd = sv[0]; + ctx->pair_sp_fd = sv[1]; + + pthread_mutex_init(&ctx->mutex, NULL); + pthread_mutex_init(&ctx->free_mutex, NULL); + + TAILQ_INIT(&ctx->working_queue); + TAILQ_INIT(&ctx->free_queue); + + /* initialize free log_buff */ + log_buff *w_buff = malloc(sizeof(log_buff) * NUM_LOG_BUFF); + assert(w_buff); + for (i = 0; i < NUM_LOG_BUFF; i++) { + EnqueueFreeBuffer(ctx, &w_buff[i]); + } +} +/*----------------------------------------------------------------------------*/ +void *ThreadLogMain(void *arg) +{ + size_t len; + log_thread_context *ctx = (log_thread_context *)arg; + log_buff *w_buff; + int cnt; + + mtcp_core_affinitize(ctx->cpu); + //fprintf(stderr, "[CPU %d] Log thread created. 
thread: %lu\n", + // ctx->cpu, pthread_self()); + + TRACE_LOG("Log thread %d is starting.\n", ctx->cpu); + + while (!ctx->done) { + /* handle every jobs in job buffer*/ + cnt = 0; + while ((w_buff = DequeueJobBuffer(ctx))) { + if (++cnt > NUM_LOG_BUFF) { + TRACE_ERROR("CPU %d: Exceed NUM_LOG_BUFF %d.\n", ctx->cpu, cnt); + break; + } + len = fwrite(w_buff->buff, 1, w_buff->buff_len, w_buff->fid); + if ((int)len != w_buff->buff_len) { + TRACE_ERROR("CPU %d: Tried to write %d, but only write %ld\n", ctx->cpu, w_buff->buff_len, len); + } + //assert(len == w_buff->buff_len); + EnqueueFreeBuffer(ctx, w_buff); + } + + /* */ + while (ctx->state == IDLE_LOGT && !ctx->done) { + char temp[1]; + int ret = read(ctx->sp_fd, temp, 1); + if (ret) + break; + } + } + + TRACE_LOG("Log thread %d out of first loop.\n", ctx->cpu); + /* handle every jobs in job buffer*/ + cnt = 0; + while ((w_buff = DequeueJobBuffer(ctx))) { + if (++cnt > NUM_LOG_BUFF) { + TRACE_ERROR("CPU %d: " + "Exceed NUM_LOG_BUFF %d in final loop.\n", + ctx->cpu, cnt); + break; + } + len = fwrite(w_buff->buff, 1, w_buff->buff_len, w_buff->fid); + assert(len == w_buff->buff_len); + EnqueueFreeBuffer(ctx, w_buff); + } + + TRACE_LOG("Log thread %d finished.\n", ctx->cpu); + pthread_exit(NULL); + + return NULL; +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/memory_mgt.c b/lib/flash/mtcp/memory_mgt.c new file mode 100644 index 0000000..c27c9c1 --- /dev/null +++ b/lib/flash/mtcp/memory_mgt.c @@ -0,0 +1,232 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + #include +#include +#include +#include +#include +#include +#include +#include "debug.h" +#include "memory_mgt.h" +/*----------------------------------------------------------------------------*/ +typedef struct tag_mem_chunk { + int mc_free_chunks; + struct tag_mem_chunk *mc_next; +} mem_chunk; +/*----------------------------------------------------------------------------*/ +typedef mem_chunk *mem_chunk_t; +/*----------------------------------------------------------------------------*/ +#if defined(DISABLE_DPDK) || defined(ENABLE_ONVM) +typedef struct mem_pool { + u_char *mp_startptr; /* start pointer */ + mem_chunk_t mp_freeptr; /* pointer to the start memory chunk */ + int mp_free_chunks; /* number of total free chunks */ + int mp_total_chunks; /* number of total free chunks */ + int mp_chunk_size; /* chunk size in bytes */ + int mp_type; + +} mem_pool; +/*----------------------------------------------------------------------------*/ +mem_pool *MPCreate(int chunk_size, size_t total_size) +{ + mem_pool_t mp; + + if (chunk_size < (int)sizeof(mem_chunk)) { + TRACE_ERROR("The chunk size should be larger than %lu. 
current: %d\n", sizeof(mem_chunk), chunk_size); + return NULL; + } + if (chunk_size % 4 != 0) { + TRACE_ERROR("The chunk size should be multiply of 4!\n"); + return NULL; + } + + //assert(chunk_size <= 2*1024*1024); + + if ((mp = calloc(1, sizeof(mem_pool))) == NULL) { + perror("calloc failed"); + exit(0); + } + mp->mp_type = 0; + mp->mp_chunk_size = chunk_size; + mp->mp_free_chunks = ((total_size + (chunk_size - 1)) / chunk_size); + mp->mp_total_chunks = mp->mp_free_chunks; + total_size = chunk_size * ((size_t)mp->mp_free_chunks); + + /* allocate the big memory chunk */ + int res = posix_memalign((void **)&mp->mp_startptr, getpagesize(), total_size); + if (res != 0) { + TRACE_ERROR("posix_memalign failed, size=%ld\n", total_size); + assert(0); + free(mp); + return (NULL); + } + + /* try mlock only for superuser */ + if (geteuid() == 0) { + if (mlock(mp->mp_startptr, total_size) < 0) + TRACE_ERROR("m_lock failed, size=%ld\n", total_size); + } + + mp->mp_freeptr = (mem_chunk_t)mp->mp_startptr; + mp->mp_freeptr->mc_free_chunks = mp->mp_free_chunks; + mp->mp_freeptr->mc_next = NULL; + + return mp; +} +/*----------------------------------------------------------------------------*/ +void *MPAllocateChunk(mem_pool_t mp) +{ + mem_chunk_t p = mp->mp_freeptr; + + if (mp->mp_free_chunks == 0) + return (NULL); + assert(p->mc_free_chunks > 0 && p->mc_free_chunks <= p->mc_free_chunks); + + p->mc_free_chunks--; + mp->mp_free_chunks--; + if (p->mc_free_chunks) { + /* move right by one chunk */ + mp->mp_freeptr = (mem_chunk_t)((u_char *)p + mp->mp_chunk_size); + mp->mp_freeptr->mc_free_chunks = p->mc_free_chunks; + mp->mp_freeptr->mc_next = p->mc_next; + } else { + mp->mp_freeptr = p->mc_next; + } + + return p; +} +/*----------------------------------------------------------------------------*/ +void MPFreeChunk(mem_pool_t mp, void *p) +{ + mem_chunk_t mcp = (mem_chunk_t)p; + + // assert((u_char*)p >= mp->mp_startptr && + // (u_char *)p < mp->mp_startptr + mp->mp_total_size); + 
assert(((u_char *)p - mp->mp_startptr) % mp->mp_chunk_size == 0); + // assert(*((u_char *)p + (mp->mp_chunk_size-1)) == 'a'); + // *((u_char *)p + (mp->mp_chunk_size-1)) = 'f'; + + mcp->mc_free_chunks = 1; + mcp->mc_next = mp->mp_freeptr; + mp->mp_freeptr = mcp; + mp->mp_free_chunks++; +} +/*----------------------------------------------------------------------------*/ +void MPDestroy(mem_pool_t mp) +{ + free(mp->mp_startptr); + free(mp); +} +/*----------------------------------------------------------------------------*/ +int MPGetFreeChunks(mem_pool_t mp) +{ + return mp->mp_free_chunks; +} +/*----------------------------------------------------------------------------*/ +// static uint32_t MPIsDanger(mem_pool_t mp) +// { +// #define DANGER_THRESHOLD 0.95 +// #define SAFE_THRESHOLD 0.90 +// uint32_t danger_num = mp->mp_total_chunks * DANGER_THRESHOLD; +// uint32_t safe_num = mp->mp_total_chunks * SAFE_THRESHOLD; +// if ((int)danger_num < mp->mp_total_chunks - mp->mp_free_chunks) { +// return mp->mp_total_chunks - mp->mp_free_chunks - safe_num; +// } +// return 0; +// } +// /*----------------------------------------------------------------------------*/ +// static uint32_t MPIsOverSafeline(mem_pool_t mp) +// { +// #define SAFELINE 0.90 +// uint32_t safe_num = mp->mp_total_chunks * SAFELINE; +// if ((int)safe_num < mp->mp_total_chunks - mp->mp_free_chunks) { +// return 1; +// } +// return 0; +// } +/*----------------------------------------------------------------------------*/ +#else +/*----------------------------------------------------------------------------*/ +mem_pool_t MPCreate(char *name, int chunk_size, size_t total_size) +{ + struct rte_mempool *mp; + size_t sz, items; + + items = total_size / chunk_size; + sz = RTE_ALIGN_CEIL(chunk_size, RTE_CACHE_LINE_SIZE); + mp = rte_mempool_create(name, items, sz, 0, 0, NULL, 0, NULL, 0, rte_socket_id(), MEMPOOL_F_NO_SPREAD); + + if (mp == NULL) { + TRACE_ERROR("Can't allocate memory for mempool!\n"); + 
exit(EXIT_FAILURE); + } + + return mp; +} +/*----------------------------------------------------------------------------*/ +void *MPAllocateChunk(mem_pool_t mp) +{ + int rc; + void *buf; + + rc = rte_mempool_get(mp, (void **)&buf); + if (rc != 0) + return NULL; + + return buf; +} +/*----------------------------------------------------------------------------*/ +void MPFreeChunk(mem_pool_t mp, void *p) +{ + rte_mempool_put(mp, p); +} +/*----------------------------------------------------------------------------*/ +void MPDestroy(mem_pool_t mp) +{ +#if RTE_VERSION < RTE_VERSION_NUM(16, 7, 0, 0) + /* do nothing.. old versions don't have a method to reclaim back mem */ +#else + rte_mempool_free(mp); +#endif +} +/*----------------------------------------------------------------------------*/ +int MPGetFreeChunks(mem_pool_t mp) +{ +#if RTE_VERSION <= RTE_VERSION_NUM(16, 7, 0, 0) + return (int)rte_mempool_free_count(mp); +#else + return (int)rte_mempool_avail_count(mp); +#endif +} +/*----------------------------------------------------------------------------*/ +#endif diff --git a/lib/flash/mtcp/meson.build b/lib/flash/mtcp/meson.build new file mode 100644 index 0000000..4fc1fdc --- /dev/null +++ b/lib/flash/mtcp/meson.build @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2025 Debojeet Das + +sources = files('core.c', + 'tcp_stream.c', + 'config.c', + 'api.c', + 'eventpoll.c', + 'socket.c', + 'pipe.c', + 'tcp_util.c', + 'eth_in.c', + 'ip_in.c', + 'tcp_in.c', + 'eth_out.c', + 'ip_out.c', + 'tcp_out.c', + 'arp.c', + 'timer.c', + 'cpu.c', + 'rss.c', + 'addr_pool.c', + 'fhash.c', + 'memory_mgt.c', + 'logger.c', + 'debug.c', + 'tcp_rb_frag_queue.c', + 'tcp_ring_buffer.c', + 'tcp_send_buffer.c', + 'tcp_sb_queue.c', + 'tcp_stream_queue.c', + 'psio_module.c', + 'io_module.c', + 'dpdk_module.c', + 'onvm_module.c', + 'icmp.c', + 'flash_module.c') + +headers = files('include/mtcp_api.h', 'include/mtcp_epoll.h') + +deps += [include, log, nf, params, uds] + 
+libmtcp = library(libname, sources, install: true, dependencies: deps, include_directories: include_directories('./include')) +mtcp = declare_dependency(link_with: libmtcp, include_directories: include_directories('./include')) + +flash_libs += mtcp diff --git a/lib/flash/mtcp/netmap_module.c b/lib/flash/mtcp/netmap_module.c new file mode 100644 index 0000000..3ed7587 --- /dev/null +++ b/lib/flash/mtcp/netmap_module.c @@ -0,0 +1,299 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* for io_module_func def'ns */ +#include "io_module.h" +#ifndef DISABLE_NETMAP +/* for mtcp related def'ns */ +#include "mtcp.h" +/* for errno */ +#include +/* for logging */ +#include "debug.h" +/* for num_devices_* */ +#include "config.h" +/* for netmap definitions */ +#define NETMAP_WITH_LIBS +#include "netmap_user.h" +/* for poll */ +#include +/* for ETHER_CRC_LEN */ +#include +/*----------------------------------------------------------------------------*/ +#define MAX_PKT_BURST 64 +#define ETHERNET_FRAME_SIZE 1514 +#define MAX_IFNAMELEN (IF_NAMESIZE + 10) +#define EXTRA_BUFS 512 +#define IDLE_POLL_WAIT 1 /* msecs */ +#define IDLE_POLL_COUNT 10 +//#define CONST_POLLING 1 + +/* + * Ethernet frame overhead + */ + +#define ETHER_IFG 12 +#define ETHER_PREAMBLE 8 +#define ETHER_OVR (ETHER_CRC_LEN + ETHER_PREAMBLE + ETHER_IFG) +/*----------------------------------------------------------------------------*/ + +struct netmap_private_context { + struct nm_desc *local_nmd[MAX_DEVICES]; + unsigned char snd_pktbuf[MAX_DEVICES][ETHERNET_FRAME_SIZE]; + unsigned char *rcv_pktbuf[MAX_PKT_BURST]; + uint16_t rcv_pkt_len[MAX_PKT_BURST]; + uint16_t snd_pkt_size[MAX_DEVICES]; + uint8_t dev_poll_flag[MAX_DEVICES]; + uint8_t idle_poll_count; +} __attribute__((aligned(__WORDSIZE))); +/*----------------------------------------------------------------------------*/ +void netmap_init_handle(struct mtcp_thread_context *ctxt) +{ + struct netmap_private_context *npc; + char 
ifname[MAX_IFNAMELEN]; + char nifname[MAX_IFNAMELEN]; + int j; + + /* create and initialize private I/O module context */ + ctxt->io_private_context = calloc(1, sizeof(struct netmap_private_context)); + if (ctxt->io_private_context == NULL) { + TRACE_ERROR("Failed to initialize ctxt->io_private_context: " + "Can't allocate memory\n"); + exit(EXIT_FAILURE); + } + + npc = (struct netmap_private_context *)ctxt->io_private_context; + + /* initialize per-thread netmap interfaces */ + for (j = 0; j < num_devices_attached; j++) { + if (if_indextoname(devices_attached[j], ifname) == NULL) { + TRACE_ERROR("Failed to initialize interface %s with ifidx: %d - " + "error string: %s\n", + ifname, devices_attached[j], strerror(errno)); + exit(EXIT_FAILURE); + } + + if (unlikely(CONFIG.num_cores == 1)) + sprintf(nifname, "netmap:%s", ifname); + else + sprintf(nifname, "netmap:%s-%d", ifname, ctxt->cpu); + + TRACE_INFO("Opening %s with j: %d (cpu: %d)\n", nifname, j, ctxt->cpu); + + struct nmreq base_nmd; + memset(&base_nmd, 0, sizeof(base_nmd)); + base_nmd.nr_arg3 = EXTRA_BUFS; + + npc->local_nmd[j] = nm_open(nifname, &base_nmd, 0, NULL); + if (npc->local_nmd[j] == NULL) { + TRACE_ERROR("Unable to open %s: %s\n", nifname, strerror(errno)); + exit(EXIT_FAILURE); + } + } +} +/*----------------------------------------------------------------------------*/ +int netmap_link_devices(struct mtcp_thread_context *ctxt) +{ + /* linking takes place during mtcp_init() */ + + return 0; +} +/*----------------------------------------------------------------------------*/ +void netmap_release_pkt(struct mtcp_thread_context *ctxt, int ifidx, unsigned char *pkt_data, int len) +{ + /* + * do nothing over here - memory reclamation + * will take place in dpdk_recv_pkts + */ +} +/*----------------------------------------------------------------------------*/ +int netmap_send_pkts(struct mtcp_thread_context *ctxt, int nif) +{ + int pkt_size, idx; + struct netmap_private_context *npc; + mtcp_manager_t 
mtcp; + + npc = (struct netmap_private_context *)ctxt->io_private_context; + idx = nif; + pkt_size = npc->snd_pkt_size[idx]; + mtcp = ctxt->mtcp_manager; + + /* assert-type statement */ + if (pkt_size == 0) + return 0; + +#ifdef NETSTAT + mtcp->nstat.tx_packets[nif]++; + mtcp->nstat.tx_bytes[nif] += pkt_size + ETHER_OVR; +#endif + +tx_again: + if (nm_inject(npc->local_nmd[idx], npc->snd_pktbuf[idx], pkt_size) == 0) { + TRACE_DBG("Failed to send pkt of size %d on interface: %d\n", pkt_size, idx); + + ioctl(npc->local_nmd[idx]->fd, NIOCTXSYNC, NULL); + goto tx_again; + } + +#ifdef NETSTAT + // mtcp->nstat.rx_errors[idx]++; +#endif + npc->snd_pkt_size[idx] = 0; + + return 1; +} +/*----------------------------------------------------------------------------*/ +uint8_t *netmap_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t pktsize) +{ + struct netmap_private_context *npc; + int idx = nif; + + npc = (struct netmap_private_context *)ctxt->io_private_context; + if (npc->snd_pkt_size[idx] != 0) + netmap_send_pkts(ctxt, nif); + + npc->snd_pkt_size[idx] = pktsize; + + return (uint8_t *)npc->snd_pktbuf[idx]; +} +/*----------------------------------------------------------------------------*/ +int32_t netmap_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx) +{ + struct netmap_private_context *npc; + struct nm_desc *d; + npc = (struct netmap_private_context *)ctxt->io_private_context; + d = npc->local_nmd[ifidx]; + + int p = 0; + int c, got = 0, ri = d->cur_rx_ring; + int n = d->last_rx_ring - d->first_rx_ring + 1; + int cnt = MAX_PKT_BURST; + + for (c = 0; c < n && cnt != got && npc->dev_poll_flag[ifidx]; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + + ri = d->cur_rx_ring + c; + if (ri > d->last_rx_ring) + ri = d->first_rx_ring; + ring = NETMAP_RXRING(d->nifp, ri); + for (; !nm_ring_empty(ring) && cnt != got; got++) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + npc->rcv_pktbuf[p] = (u_char *)NETMAP_BUF(ring, idx); 
+ npc->rcv_pkt_len[p] = ring->slot[i].len; + p++; + ring->head = ring->cur = nm_ring_next(ring, i); + } + } + d->cur_rx_ring = ri; + + npc->dev_poll_flag[ifidx] = 0; + + return p; +} +/*----------------------------------------------------------------------------*/ +uint8_t *netmap_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len) +{ + struct netmap_private_context *npc; + npc = (struct netmap_private_context *)ctxt->io_private_context; + + *len = npc->rcv_pkt_len[index]; + return (unsigned char *)npc->rcv_pktbuf[index]; +} +/*----------------------------------------------------------------------------*/ +int32_t netmap_select(struct mtcp_thread_context *ctxt) +{ + int i, rc; + struct pollfd pfd[MAX_DEVICES]; + struct netmap_private_context *npc = (struct netmap_private_context *)ctxt->io_private_context; + + /* see if num_devices have been registered */ + if (npc->local_nmd[0] == NULL) + return -1; + + for (i = 0; i < num_devices_attached; i++) { + pfd[i].fd = npc->local_nmd[i]->fd; + pfd[i].events = POLLIN; + } + +#ifndef CONST_POLLING + if (npc->idle_poll_count >= IDLE_POLL_COUNT) { + rc = poll(pfd, num_devices_attached, IDLE_POLL_WAIT); + } else +#endif + { + rc = poll(pfd, num_devices_attached, 0); + } + + npc->idle_poll_count = (rc == 0) ? 
(npc->idle_poll_count + 1) : 0; + + for (i = 0; rc > 0 && i < num_devices_attached; i++) + if (!(pfd[i].revents & (POLLERR))) + npc->dev_poll_flag[i] = 1; + return 0; +} +/*----------------------------------------------------------------------------*/ +void netmap_destroy_handle(struct mtcp_thread_context *ctxt) +{ +} +/*----------------------------------------------------------------------------*/ +void netmap_load_module(void) +{ + /* not needed - all initializations done in netmap_init_handle() */ +} +/*----------------------------------------------------------------------------*/ +io_module_func netmap_module_func = { .load_module = netmap_load_module, + .init_handle = netmap_init_handle, + .link_devices = netmap_link_devices, + .release_pkt = netmap_release_pkt, + .send_pkts = netmap_send_pkts, + .get_wptr = netmap_get_wptr, + .recv_pkts = netmap_recv_pkts, + .get_rptr = netmap_get_rptr, + .select = netmap_select, + .destroy_handle = netmap_destroy_handle, + .dev_ioctl = NULL }; +/*----------------------------------------------------------------------------*/ +#else +io_module_func netmap_module_func = { .load_module = NULL, + .init_handle = NULL, + .link_devices = NULL, + .release_pkt = NULL, + .send_pkts = NULL, + .get_wptr = NULL, + .recv_pkts = NULL, + .get_rptr = NULL, + .select = NULL, + .destroy_handle = NULL, + .dev_ioctl = NULL }; +/*----------------------------------------------------------------------------*/ +#endif /* !DISABLE_NETMAP */ diff --git a/lib/flash/mtcp/onvm_module.c b/lib/flash/mtcp/onvm_module.c new file mode 100644 index 0000000..a09eb1a --- /dev/null +++ b/lib/flash/mtcp/onvm_module.c @@ -0,0 +1,579 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions are met:
 + * * Redistributions of source code must retain the above copyright
 + * notice, this list of conditions and the following disclaimer.
 + * * Redistributions in binary form must reproduce the above copyright
 + * notice, this list of conditions and the following disclaimer in the
 + * documentation and/or other materials provided with the distribution.
 + * * Neither the name of the copyright holder nor the
 + * names of its contributors may be used to endorse or promote products
 + * derived from this software without specific prior written permission.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
 + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +/* for io_module_func def'ns */ +#include "io_module.h" +#ifdef ENABLE_ONVM +/* for mtcp related def'ns */ +#include "mtcp.h" +/* for errno */ +#include +/* for logging */ +#include "debug.h" +/* for num_devices_* */ +#include "config.h" +/* for rte_max_eth_ports */ +#include +/* for rte_eth_rxconf */ +#include +/* for delay funcs */ +#include +#include +#define ENABLE_STATS_IOCTL 1 +#ifdef ENABLE_STATS_IOCTL +/* for close */ +#include +/* for open */ +#include +/* for ioctl */ +#include +#endif /* !ENABLE_STATS_IOCTL */ +/* for ip pseudo-chksum */ +#include + +/* for onvm rings */ +#include +#include + +/*----------------------------------------------------------------------------*/ +/* Essential macros */ +//#define MAX_RX_QUEUE_PER_LCORE MAX_CPUS +//#define MAX_TX_QUEUE_PER_PORT MAX_CPUS +#define PKTMBUF_POOL_NAME "MProc_pktmbuf_pool" + +#ifdef ENABLELRO +#define MBUF_SIZE (16384 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) +#else +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) +#endif /* !ENABLELRO */ +#define NB_MBUF 8192 +#define MEMPOOL_CACHE_SIZE 256 +//#define RX_IDLE_ENABLE 1 +#define RX_IDLE_TIMEOUT 1 /* in micro-seconds */ +#define RX_IDLE_THRESH 64 +#define MAX_PKT_BURST ((uint16_t)32) /*64*/ /*128*/ + +/* + * Configurable number of RX/TX ring descriptors + */ +//#define RTE_TEST_RX_DESC_DEFAULT 128 +//#define RTE_TEST_TX_DESC_DEFAULT 128 + +/*----------------------------------------------------------------------------*/ +/* packet memory pool for storing packet bufs */ +static struct rte_mempool *pktmbuf_pool = NULL; + +//#define DEBUG 1 +#ifdef DEBUG +/* ethernet addresses of ports */ +static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; +#endif + +static struct rte_eth_dev_info dev_info[RTE_MAX_ETHPORTS]; + +struct mbuf_table { + unsigned len; /* length of queued packets */ + struct rte_mbuf *m_table[MAX_PKT_BURST]; +}; + +struct dpdk_private_context { + struct mbuf_table rmbufs[RTE_MAX_ETHPORTS]; + 
struct mbuf_table wmbufs[RTE_MAX_ETHPORTS]; + struct rte_mempool *pktmbuf_pool; + struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; +#ifdef RX_IDLE_ENABLE + uint8_t rx_idle; +#endif +#ifdef ENABLELRO + struct rte_mbuf *cur_rx_m; +#endif +#ifdef ENABLE_STATS_IOCTL + int fd; + uint32_t cur_ts; +#endif /* !ENABLE_STATS_IOCTL */ +} __rte_cache_aligned; + +/* onvm structs */ +struct onvm_nf_info *nf_info; +struct rte_ring *rx_ring; +struct rte_ring *tx_ring; +volatile struct onvm_nf *nf; + +#ifdef ENABLE_STATS_IOCTL +/** + * stats struct passed on from user space to the driver + */ +struct stats_struct { + uint64_t tx_bytes; + uint64_t tx_pkts; + uint64_t rx_bytes; + uint64_t rx_pkts; + uint64_t rmiss; + uint64_t rerr; + uint64_t terr; + uint8_t qid; + uint8_t dev; +}; +#endif /* !ENABLE_STATS_IOCTL */ +/*----------------------------------------------------------------------------*/ +void onvm_init_handle(struct mtcp_thread_context *ctxt) +{ + struct dpdk_private_context *dpc; + int i, j; + char mempool_name[20]; + + /* create and initialize private I/O module context */ + ctxt->io_private_context = calloc(1, sizeof(struct dpdk_private_context)); + if (ctxt->io_private_context == NULL) { + TRACE_ERROR("Failed to initialize ctxt->io_private_context: " + "Can't allocate memory\n"); + exit(EXIT_FAILURE); + } + + sprintf(mempool_name, "mbuf_pool-%d", ctxt->cpu); + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + dpc->pktmbuf_pool = pktmbuf_pool; + + /* Complete onvm handshake */ + onvm_nflib_nf_ready(CONFIG.nf_local_ctx->nf); + + /* Initialize onvm rings*/ + nf = CONFIG.nf_local_ctx->nf; + rx_ring = nf->rx_q; + tx_ring = nf->tx_q; + + /* set wmbufs correctly */ + for (j = 0; j < num_devices_attached; j++) { + /* Allocate wmbufs for each registered port */ + for (i = 0; i < MAX_PKT_BURST; i++) { + dpc->wmbufs[j].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool); + if (dpc->wmbufs[j].m_table[i] == NULL) { + TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n", 
ctxt->cpu, i, j); + exit(EXIT_FAILURE); + } + } + /* set mbufs queue length to 0 to begin with */ + dpc->wmbufs[j].len = 0; + } + +#ifdef ENABLE_STATS_IOCTL + dpc->fd = open("/dev/dpdk-iface", O_RDWR); + if (dpc->fd == -1) { + TRACE_ERROR("Can't open /dev/dpdk-iface for context->cpu: %d! " + "Are you using mlx4/mlx5 driver?\n", + ctxt->cpu); + } +#endif /* !ENABLE_STATS_IOCTL */ +} +/*----------------------------------------------------------------------------*/ +int onvm_link_devices(struct mtcp_thread_context *ctxt) +{ + /* linking takes place during mtcp_init() */ + + return 0; +} +/*----------------------------------------------------------------------------*/ +void onvm_release_pkt(struct mtcp_thread_context *ctxt, int ifidx, unsigned char *pkt_data, int len) +{ + /* + * do nothing over here - memory reclamation + * will take place in onvm_recv_pkts + */ +} +/*----------------------------------------------------------------------------*/ +int onvm_send_pkts(struct mtcp_thread_context *ctxt, int nif) +{ + struct dpdk_private_context *dpc; + mtcp_manager_t mtcp; + int ret, i; + struct onvm_pkt_meta *meta; + struct onvm_ft_ipv4_5tuple key; + int ifidx; + + ifidx = nif; + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + mtcp = ctxt->mtcp_manager; + ret = 0; + + /* if there are packets in the queue... 
flush them out to the wire */ + if (dpc->wmbufs[nif].len > /*= MAX_PKT_BURST*/ 0) { + struct rte_mbuf **pkts; +#ifdef ENABLE_STATS_IOCTL + struct rte_eth_stats stats; + struct stats_struct ss; +#endif /* !ENABLE_STATS_IOCTL */ + int cnt = dpc->wmbufs[nif].len; + pkts = dpc->wmbufs[nif].m_table; +#ifdef NETSTAT + mtcp->nstat.tx_packets[nif] += cnt; +#ifdef ENABLE_STATS_IOCTL + /* only pass stats after >= 1 sec interval */ + if (abs(mtcp->cur_ts - dpc->cur_ts) >= 1000 && likely(dpc->fd >= 0)) { + /* rte_get_stats is global func, use only for 1 core */ + if (ctxt->cpu == 0) { + rte_eth_stats_get(CONFIG.eths[ifidx].ifindex, &stats); + ss.rmiss = stats.imissed; + ss.rerr = stats.ierrors; + ss.terr = stats.oerrors; + } else + ss.rmiss = ss.rerr = ss.terr = 0; + + ss.tx_pkts = mtcp->nstat.tx_packets[ifidx]; + ss.tx_bytes = mtcp->nstat.tx_bytes[ifidx]; + ss.rx_pkts = mtcp->nstat.rx_packets[ifidx]; + ss.rx_bytes = mtcp->nstat.rx_bytes[ifidx]; + ss.qid = ctxt->cpu; + ss.dev = CONFIG.eths[ifidx].ifindex; + /* pass the info now */ + ioctl(dpc->fd, 0, &ss); + dpc->cur_ts = mtcp->cur_ts; + if (ctxt->cpu == 0) + rte_eth_stats_reset(CONFIG.eths[ifidx].ifindex); + } +#endif /* !ENABLE_STATS_IOCTL */ +#endif + + for (i = 0; i < cnt; i++) { + meta = onvm_get_pkt_meta(pkts[i]); + if (CONFIG.onvm_dest == (uint16_t)-1) { + meta->action = ONVM_NF_ACTION_OUT; + meta->destination = CONFIG.eths[nif].ifindex; + } else { + onvm_ft_fill_key(&key, pkts[i]); + pkts[i]->hash.rss = onvm_softrss(&key); + meta->action = ONVM_NF_ACTION_TONF; + meta->destination = CONFIG.onvm_dest; + } + } + ret = rte_ring_enqueue_bulk(tx_ring, (void *const *)pkts, cnt, NULL); + if (cnt > 0 && ret == 0) { + TRACE_ERROR("Dropped %d packets", cnt); + nf->stats.tx_drop += cnt; + } + nf->stats.tx += cnt; + + /* time to allocate fresh mbufs for the queue */ + for (i = 0; i < dpc->wmbufs[nif].len; i++) { + dpc->wmbufs[nif].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool); + /* error checking */ + if 
(unlikely(dpc->wmbufs[nif].m_table[i] == NULL)) { + TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n", ctxt->cpu, i, nif); + exit(EXIT_FAILURE); + } + } + /* reset the len of mbufs var after flushing of packets */ + dpc->wmbufs[nif].len = 0; + } + + return ret; +} +/*----------------------------------------------------------------------------*/ +uint8_t *onvm_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t pktsize) +{ + struct dpdk_private_context *dpc; + mtcp_manager_t mtcp; + struct rte_mbuf *m; + uint8_t *ptr; + int len_of_mbuf; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + mtcp = ctxt->mtcp_manager; + + /* sanity check */ + if (unlikely(dpc->wmbufs[nif].len == MAX_PKT_BURST)) + return NULL; + + len_of_mbuf = dpc->wmbufs[nif].len; + m = dpc->wmbufs[nif].m_table[len_of_mbuf]; + + /* retrieve the right write offset */ + ptr = (void *)rte_pktmbuf_mtod(m, struct ether_hdr *); + m->pkt_len = m->data_len = pktsize; + m->nb_segs = 1; + m->next = NULL; + +#ifdef NETSTAT + mtcp->nstat.tx_bytes[nif] += pktsize + 24; +#endif + + /* increment the len_of_mbuf var */ + dpc->wmbufs[nif].len = len_of_mbuf + 1; + + return (uint8_t *)ptr; +} +/*----------------------------------------------------------------------------*/ +static inline void free_pkts(struct rte_mbuf **mtable, unsigned len) +{ + int i; + + /* free the freaking packets */ + for (i = 0; i < len; i++) { + rte_pktmbuf_free(mtable[i]); + RTE_MBUF_PREFETCH_TO_FREE(mtable[i + 1]); + } +} +/*----------------------------------------------------------------------------*/ +int32_t onvm_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx) +{ + struct dpdk_private_context *dpc; + int ret; + void *pkts[MAX_PKT_BURST]; + int i; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + + if (dpc->rmbufs[ifidx].len != 0) { + free_pkts(dpc->rmbufs[ifidx].m_table, dpc->rmbufs[ifidx].len); + dpc->rmbufs[ifidx].len = 0; + } + + ret = rte_ring_dequeue_burst(rx_ring, pkts, 
MAX_PKT_BURST, NULL); + + for (i = 0; i < ret; i++) { + dpc->pkts_burst[i] = (struct rte_mbuf *)pkts[i]; + } + +#ifdef RX_IDLE_ENABLE + dpc->rx_idle = (likely(ret != 0)) ? 0 : dpc->rx_idle + 1; +#endif + dpc->rmbufs[ifidx].len = ret; + + return ret; +} +/*----------------------------------------------------------------------------*/ +uint8_t *onvm_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len) +{ + struct dpdk_private_context *dpc; + struct rte_mbuf *m; + uint8_t *pktbuf; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + + m = dpc->pkts_burst[index]; + //rte_prefetch0(rte_pktmbuf_mtod(m, void *)); + *len = m->pkt_len; + pktbuf = rte_pktmbuf_mtod(m, uint8_t *); + + /* enqueue the pkt ptr in mbuf */ + dpc->rmbufs[ifidx].m_table[index] = m; + + /* verify checksum values from ol_flags */ + if ((m->ol_flags & (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD)) != 0) { + TRACE_ERROR("%s(%p, %d, %d): mbuf with invalid checksum: " + "%p(%lu);\n", + __func__, ctxt, ifidx, index, m, m->ol_flags); + pktbuf = NULL; + } +#ifdef ENABLELRO + dpc->cur_rx_m = m; +#endif /* ENABLELRO */ + + return pktbuf; +} +/*----------------------------------------------------------------------------*/ +int32_t onvm_select(struct mtcp_thread_context *ctxt) +{ +#ifdef RX_IDLE_ENABLE + struct dpdk_private_context *dpc; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + if (dpc->rx_idle > RX_IDLE_THRESH) { + dpc->rx_idle = 0; + usleep(RX_IDLE_TIMEOUT); + } +#endif + return 0; +} +/*----------------------------------------------------------------------------*/ +void onvm_destroy_handle(struct mtcp_thread_context *ctxt) +{ + struct dpdk_private_context *dpc; + int i; + + dpc = (struct dpdk_private_context *)ctxt->io_private_context; + + /* free wmbufs */ + for (i = 0; i < num_devices_attached; i++) + free_pkts(dpc->wmbufs[i].m_table, MAX_PKT_BURST); + +#ifdef ENABLE_STATS_IOCTL + /* free fd */ + if (dpc->fd >= 0) + close(dpc->fd); +#endif 
/* !ENABLE_STATS_IOCTL */ + + /* free it all up */ + free(dpc); +} +/*----------------------------------------------------------------------------*/ +void onvm_load_module(void) +{ + int i, portid; + + pktmbuf_pool = rte_mempool_lookup(PKTMBUF_POOL_NAME); + if (pktmbuf_pool == NULL) { + rte_exit(EXIT_FAILURE, "Cannot init mbuf pool, errno: %d\n", rte_errno); + } + + for (i = 0; i < num_devices_attached; ++i) { + /* get portid form the index of attached devices */ + portid = devices_attached[i]; + /* check port capabilities */ + rte_eth_dev_info_get(portid, &dev_info[portid]); + } +} +/*----------------------------------------------------------------------------*/ +int32_t onvm_dev_ioctl(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp) +{ + struct dpdk_private_context *dpc; + struct rte_mbuf *m; + int len_of_mbuf; + struct iphdr *iph; + struct tcphdr *tcph; + void **argpptr = (void **)argp; +#ifdef ENABLELRO + uint8_t *payload, *to; + int seg_off; +#endif + + if (cmd == DRV_NAME) { + *argpptr = (void *)dev_info[nif].driver_name; + return 0; + } + + int eidx = CONFIG.nif_to_eidx[nif]; + + iph = (struct iphdr *)argp; + dpc = (struct dpdk_private_context *)ctx->io_private_context; + len_of_mbuf = dpc->wmbufs[eidx].len; + + switch (cmd) { + case PKT_TX_IP_CSUM: + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0) + goto dev_ioctl_err; + m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1]; + m->ol_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4; + m->l2_len = sizeof(struct ether_hdr); + m->l3_len = (iph->ihl << 2); + break; + case PKT_TX_TCP_CSUM: + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0) + goto dev_ioctl_err; + m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1]; + tcph = (struct tcphdr *)((unsigned char *)iph + (iph->ihl << 2)); + m->ol_flags |= PKT_TX_TCP_CKSUM; + tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags); + break; +#ifdef ENABLELRO + case PKT_RX_TCP_LROSEG: + m = dpc->cur_rx_m; + //if (m->next != 
NULL) + // rte_prefetch0(rte_pktmbuf_mtod(m->next, void *)); + iph = rte_pktmbuf_mtod_offset(m, struct iphdr *, sizeof(struct ether_hdr)); + tcph = (struct tcphdr *)((u_char *)iph + (iph->ihl << 2)); + payload = (uint8_t *)tcph + (tcph->doff << 2); + + seg_off = m->data_len - sizeof(struct ether_hdr) - (iph->ihl << 2) - (tcph->doff << 2); + + to = (uint8_t *)argp; + m = m->next; + memcpy(to, payload, seg_off); + while (m != NULL) { + //if (m->next != NULL) + // rte_prefetch0(rte_pktmbuf_mtod(m->next, void *)); + memcpy(to + seg_off, rte_pktmbuf_mtod(m, uint8_t *), m->data_len); + seg_off += m->data_len; + m = m->next; + } + break; +#endif + case PKT_TX_TCPIP_CSUM: + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0) + goto dev_ioctl_err; + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0) + goto dev_ioctl_err; + m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1]; + iph = rte_pktmbuf_mtod_offset(m, struct iphdr *, sizeof(struct ether_hdr)); + tcph = (struct tcphdr *)((uint8_t *)iph + (iph->ihl << 2)); + m->l2_len = sizeof(struct ether_hdr); + m->l3_len = (iph->ihl << 2); + m->l4_len = (tcph->doff << 2); + m->ol_flags = PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4; + tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags); + break; + case PKT_RX_IP_CSUM: + if ((dev_info[nif].rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) == 0) + goto dev_ioctl_err; + break; + case PKT_RX_TCP_CSUM: + if ((dev_info[nif].rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM) == 0) + goto dev_ioctl_err; + break; + case PKT_TX_TCPIP_CSUM_PEEK: + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0) + goto dev_ioctl_err; + if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0) + goto dev_ioctl_err; + break; + default: + goto dev_ioctl_err; + } + return 0; +dev_ioctl_err: + return -1; +} +/*----------------------------------------------------------------------------*/ +io_module_func onvm_module_func = { .load_module = 
onvm_load_module, + .init_handle = onvm_init_handle, + .link_devices = onvm_link_devices, + .release_pkt = onvm_release_pkt, + .send_pkts = onvm_send_pkts, + .get_wptr = onvm_get_wptr, + .recv_pkts = onvm_recv_pkts, + .get_rptr = onvm_get_rptr, + .select = onvm_select, + .destroy_handle = onvm_destroy_handle, + .dev_ioctl = onvm_dev_ioctl }; +/*----------------------------------------------------------------------------*/ +#else +io_module_func onvm_module_func = { .load_module = NULL, + .init_handle = NULL, + .link_devices = NULL, + .release_pkt = NULL, + .send_pkts = NULL, + .get_wptr = NULL, + .recv_pkts = NULL, + .get_rptr = NULL, + .select = NULL, + .destroy_handle = NULL, + .dev_ioctl = NULL }; +/*----------------------------------------------------------------------------*/ +#endif /* ENABLE_ONVM */ diff --git a/lib/flash/mtcp/pacing.c b/lib/flash/mtcp/pacing.c new file mode 100644 index 0000000..c09929b --- /dev/null +++ b/lib/flash/mtcp/pacing.c @@ -0,0 +1,127 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "pacing.h" +#include "clock.h" +#include "tcp_util.h" +/*----------------------------------------------------------------------------*/ +#if RATE_LIMIT_ENABLED +token_bucket *NewTokenBucket() +{ + token_bucket *bucket; + bucket = malloc(sizeof(token_bucket)); + if (bucket == NULL) + return NULL; + + bucket->rate = 0; + bucket->burst = (MSS * INIT_CWND_PKTS); + bucket->tokens = bucket->burst; + bucket->last_fill_t = now_usecs(); + return bucket; +} +/*----------------------------------------------------------------------------*/ +void _refill_bucket(token_bucket *bucket) +{ + uint32_t elapsed = time_since_usecs(bucket->last_fill_t); + double new_tokens = SECONDS_TO_USECS(bucket->rate * elapsed); + double prev_tokens = bucket->tokens; + bucket->tokens = MIN(bucket->burst, bucket->tokens + new_tokens); + if (bucket->tokens > prev_tokens) { + bucket->last_fill_t = now_usecs(); + } else { + //fprintf(stderr, "elapsed=%lu new=%f\n", time_since_usecs(bucket->last_fill_t), new_tokens); + } +} +/*----------------------------------------------------------------------------*/ +int SufficientTokens(token_bucket *bucket, uint64_t new_bits) +{ + double new_bytes = BITS_TO_BYTES(new_bits); + + //fprintf(stderr, 
"checking for %ld tokens\n", new_bits); + + _refill_bucket(bucket); + + if (bucket->tokens >= new_bytes) { + bucket->tokens -= new_bytes; + return 0; + } + + return -1; +} +/*----------------------------------------------------------------------------*/ +void PrintBucket(token_bucket *bucket) +{ + fprintf(stderr, "[rate=%.3f tokens=%f last=%u]\n", BPS_TO_MBPS(bucket->rate), bucket->tokens, bucket->last_fill_t); +} +/*----------------------------------------------------------------------------*/ +#endif /* !RATE_LIMIT_ENABLED */ + +#if PACING_ENABLED +/*----------------------------------------------------------------------------*/ +packet_pacer *NewPacketPacer() +{ + packet_pacer *pacer; + pacer = malloc(sizeof(packet_pacer)); + if (pacer == NULL) + return NULL; + pacer->rate_bps = 0; + pacer->extra_packets = 1; + pacer->next_send_time = 0; + return pacer; +} +/*----------------------------------------------------------------------------*/ +int CanSendNow(packet_pacer *pacer) +{ + if (pacer->rate_bps == 0) { + return TRUE; + } + + uint32_t now = now_usecs(); + if (now >= pacer->next_send_time) { + pacer->next_send_time = now + (int)(MSS / BPS_TO_MBPS(pacer->rate_bps)); + pacer->extra_packets = 1; + //fprintf(stderr, "now=%u, next=%u\n", now, pacer->next_send_time); + + return TRUE; + } else if (pacer->extra_packets) { + pacer->extra_packets--; + return TRUE; + } else { + return FALSE; + } +} +/*----------------------------------------------------------------------------*/ +void PrintPacer(packet_pacer *pacer) +{ + //fprintf(stderr, "[rate=%u next_time=%u]\n", pacer->rate_bps, pacer->next_send_time); +} +/*----------------------------------------------------------------------------*/ +#endif /* !PACING_ENABLED */ diff --git a/lib/flash/mtcp/pipe.c b/lib/flash/mtcp/pipe.c new file mode 100644 index 0000000..d10f162 --- /dev/null +++ b/lib/flash/mtcp/pipe.c @@ -0,0 +1,437 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "pipe.h" +#include "eventpoll.h" +#include "tcp_stream.h" +#include "mtcp.h" +#include "debug.h" + +#define PIPE_BUF_SIZE 10240 + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +/*---------------------------------------------------------------------------*/ +enum pipe_state { + PIPE_CLOSED, + PIPE_ACTIVE, + PIPE_CLOSE_WAIT, +}; +/*---------------------------------------------------------------------------*/ +struct pipe { + int state; + socket_map_t socket[2]; + + char *buf; + int buf_off; + int buf_tail; + int buf_len; + int buf_size; + + pthread_mutex_t pipe_lock; + pthread_cond_t pipe_cond; +}; +/*---------------------------------------------------------------------------*/ +int mtcp_pipe(mctx_t mctx, int pipeid[2]) +{ + socket_map_t socket[2]; + struct pipe *pp; + int ret; + + socket[0] = AllocateSocket(mctx, MTCP_SOCK_PIPE, FALSE); + if (!socket[0]) { + errno = ENFILE; + return -1; + } + socket[1] = AllocateSocket(mctx, MTCP_SOCK_PIPE, FALSE); + if (!socket[1]) { + FreeSocket(mctx, socket[0]->id, FALSE); + errno = ENFILE; + return -1; + } + + pp = (struct pipe *)calloc(1, sizeof(struct pipe)); + if (!pp) { + /* errno set by calloc() */ + FreeSocket(mctx, socket[0]->id, FALSE); + FreeSocket(mctx, socket[1]->id, FALSE); + return -1; + } + + pp->buf_size = PIPE_BUF_SIZE; + pp->buf = (char *)malloc(pp->buf_size); + if (!pp->buf) { + /* errno set by malloc() */ + FreeSocket(mctx, socket[0]->id, FALSE); + FreeSocket(mctx, socket[1]->id, FALSE); + free(pp); + return -1; + } + + ret = pthread_mutex_init(&pp->pipe_lock, NULL); + if (ret) { + /* errno set by pthread_mutex_init() */ + FreeSocket(mctx, socket[0]->id, FALSE); + FreeSocket(mctx, socket[1]->id, FALSE); + free(pp->buf); + free(pp); + return -1; + } + ret = pthread_cond_init(&pp->pipe_cond, NULL); + if (ret) { + /* errno set by pthread_cond_init() */ + FreeSocket(mctx, socket[0]->id, FALSE); + FreeSocket(mctx, socket[1]->id, FALSE); + free(pp->buf); + pthread_mutex_destroy(&pp->pipe_lock); + free(pp); + return -1; + } + + pp->state = PIPE_ACTIVE; + pp->socket[0] = socket[0]; + pp->socket[1] = socket[1]; + socket[0]->pp = pp; + socket[1]->pp = pp; + + pipeid[0] = 
socket[0]->id; + pipeid[1] = socket[1]->id; + + return 0; +} +/*---------------------------------------------------------------------------*/ +static void RaiseEventToPair(mtcp_manager_t mtcp, socket_map_t socket, uint32_t event) +{ + struct pipe *pp = socket->pp; + socket_map_t pair_socket; + + if (pp->socket[0] == socket) + pair_socket = pp->socket[1]; + else + pair_socket = pp->socket[0]; + + if (pair_socket->opts & MTCP_NONBLOCK) { + if (pair_socket->epoll) { + AddEpollEvent(mtcp->ep, USR_EVENT_QUEUE, pair_socket, event); + } + } else { + pthread_cond_signal(&pp->pipe_cond); + } +} +/*---------------------------------------------------------------------------*/ +int PipeRead(mctx_t mctx, int pipeid, char *buf, int len) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + struct pipe *pp; + int to_read; + int to_notify; + int ret; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + socket = GetSocket(mctx, pipeid); + if (!socket) { + return -1; + } + if (socket->socktype != MTCP_SOCK_PIPE) { + errno = EBADF; + return -1; + } + pp = socket->pp; + if (!pp) { + errno = EBADF; + return -1; + } + if (pp->state == PIPE_CLOSED) { + errno = EINVAL; + return -1; + } + if (pp->state == PIPE_CLOSE_WAIT && pp->buf_len == 0) { + return 0; + } + + if (len <= 0) { + if (socket->opts & MTCP_NONBLOCK) { + errno = EAGAIN; + return -1; + } else { + return 0; + } + } + + pthread_mutex_lock(&pp->pipe_lock); + if (!(socket->opts & MTCP_NONBLOCK)) { + while (pp->buf_len == 0) { + ret = pthread_cond_wait(&pp->pipe_cond, &pp->pipe_lock); + if (ret) { + /* errno set by pthread_cond_wait() */ + pthread_mutex_unlock(&pp->pipe_lock); + return -1; + } + } + } + + to_read = MIN(len, pp->buf_len); + if (to_read <= 0) { + pthread_mutex_unlock(&pp->pipe_lock); + if (pp->state == PIPE_ACTIVE) { + errno = EAGAIN; + return -1; + } else if (pp->state == PIPE_CLOSE_WAIT) { + return 0; + } + } + + /* if the buffer was full, notify the write event to the pair socket */ + to_notify = FALSE; 
+ if (pp->buf_len == pp->buf_size) + to_notify = TRUE; + + if (pp->buf_off + to_read < pp->buf_size) { + memcpy(buf, pp->buf + pp->buf_off, to_read); + pp->buf_off += to_read; + } else { + int temp_read = pp->buf_size - pp->buf_off; + memcpy(buf, pp->buf + pp->buf_off, temp_read); + memcpy(buf + temp_read, pp->buf, to_read - temp_read); + pp->buf_off = to_read - temp_read; + } + pp->buf_len -= to_read; + + /* notify to the pair socket for new buffer space */ + if (to_notify) { + RaiseEventToPair(mtcp, socket, MTCP_EPOLLOUT); + } + + pthread_mutex_unlock(&pp->pipe_lock); + + /* if level triggered, raise event for remainig buffer */ + if (pp->buf_len > 0) { + if ((socket->epoll & MTCP_EPOLLIN) && !(socket->epoll & MTCP_EPOLLET)) { + AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN); + } + } else if (pp->state == PIPE_CLOSE_WAIT && pp->buf_len == 0) { + AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN); + } + + return to_read; +} +/*---------------------------------------------------------------------------*/ +int PipeWrite(mctx_t mctx, int pipeid, const char *buf, int len) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + struct pipe *pp; + int to_write; + int to_notify; + int ret; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + socket = GetSocket(mctx, pipeid); + if (!socket) { + return -1; + } + if (socket->socktype != MTCP_SOCK_PIPE) { + errno = EBADF; + return -1; + } + pp = socket->pp; + if (!pp) { + errno = EBADF; + return -1; + } + if (pp->state == PIPE_CLOSED) { + errno = EINVAL; + return -1; + } + if (pp->state == PIPE_CLOSE_WAIT) { + errno = EPIPE; + return -1; + } + + if (len <= 0) { + if (socket->opts & MTCP_NONBLOCK) { + errno = EAGAIN; + return -1; + } else { + return 0; + } + } + + pthread_mutex_lock(&pp->pipe_lock); + if (!(socket->opts & MTCP_NONBLOCK)) { + while (pp->buf_len == pp->buf_size) { + ret = pthread_cond_wait(&pp->pipe_cond, &pp->pipe_lock); + if (ret) { + /* errno set by 
pthread_cond_wait() */ + pthread_mutex_unlock(&pp->pipe_lock); + return -1; + } + } + } + + to_write = MIN(len, pp->buf_size - pp->buf_len); + if (to_write <= 0) { + pthread_mutex_unlock(&pp->pipe_lock); + errno = EAGAIN; + return -1; + } + + /* if the buffer was empty, notify read event to the pair socket */ + to_notify = FALSE; + if (pp->buf_len == 0) + to_notify = TRUE; + + if (pp->buf_tail + to_write < pp->buf_size) { + /* if the data fit into the buffer, copy it */ + memcpy(pp->buf + pp->buf_tail, buf, to_write); + pp->buf_tail += to_write; + } else { + /* if the data overflow the buffer, wrap around the buffer */ + int temp_write = pp->buf_size - pp->buf_tail; + memcpy(pp->buf + pp->buf_tail, buf, temp_write); + memcpy(pp->buf, buf + temp_write, to_write - temp_write); + pp->buf_tail = to_write - temp_write; + } + pp->buf_len += to_write; + + /* notify to the pair socket for the new buffers */ + if (to_notify) { + RaiseEventToPair(mtcp, socket, MTCP_EPOLLIN); + } + + pthread_mutex_unlock(&pp->pipe_lock); + + /* if level triggered, raise event for remainig buffer */ + if (pp->buf_len < pp->buf_size) { + if ((socket->epoll & MTCP_EPOLLOUT) && !(socket->epoll & MTCP_EPOLLET)) { + AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLOUT); + } + } + + return to_write; +} +/*----------------------------------------------------------------------------*/ +int RaisePendingPipeEvents(mctx_t mctx, int epid, int pipeid) +{ + struct mtcp_epoll *ep = GetSocket(mctx, epid)->ep; + socket_map_t socket = GetSocket(mctx, pipeid); + struct pipe *pp = socket->pp; + + if (!pp) + return -1; + if (pp->state < PIPE_ACTIVE) + return -1; + + /* if there are payloads already read before epoll registration */ + /* generate read event */ + if (socket->epoll & MTCP_EPOLLIN) { + if (pp->buf_len > 0) { + AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN); + } else if (pp->state == PIPE_CLOSE_WAIT) { + AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN); 
+ } + } + + /* same thing to the write event */ + if (socket->epoll & MTCP_EPOLLOUT) { + if (pp->buf_len < pp->buf_size) { + AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLOUT); + } + } + + return 0; +} +/*---------------------------------------------------------------------------*/ +int PipeClose(mctx_t mctx, int pipeid) +{ + mtcp_manager_t mtcp; + socket_map_t socket; + struct pipe *pp; + + mtcp = GetMTCPManager(mctx); + if (!mtcp) { + return -1; + } + socket = GetSocket(mctx, pipeid); + if (!socket) { + return -1; + } + if (socket->socktype != MTCP_SOCK_PIPE) { + errno = EINVAL; + return -1; + } + pp = socket->pp; + if (!pp) { + return 0; + } + + if (pp->state == PIPE_CLOSED) { + return 0; + } + + pthread_mutex_lock(&pp->pipe_lock); + if (pp->state == PIPE_ACTIVE) { + pp->state = PIPE_CLOSE_WAIT; + RaiseEventToPair(mtcp, socket, MTCP_EPOLLIN); + pthread_mutex_unlock(&pp->pipe_lock); + return 0; + } + + /* control reaches here only when PIPE_CLOSE_WAIT */ + + if (pp->socket[0]) + pp->socket[0]->pp = NULL; + if (pp->socket[1]) + pp->socket[1]->pp = NULL; + + pthread_mutex_unlock(&pp->pipe_lock); + + pthread_mutex_destroy(&pp->pipe_lock); + pthread_cond_destroy(&pp->pipe_cond); + + free(pp->buf); + + free(pp); + + return 0; +} +/*---------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/psio_module.c b/lib/flash/mtcp/psio_module.c new file mode 100644 index 0000000..8256a78 --- /dev/null +++ b/lib/flash/mtcp/psio_module.c @@ -0,0 +1,426 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* for io_module_func def'ns */ +#include "io_module.h" +#ifndef DISABLE_PSIO +/* for mtcp related def'ns */ +#include "mtcp.h" +/* for psio related def'ns */ +#include "ps.h" +/* for errno */ +#include +/* for logging */ +#include "debug.h" +/* for num_devices_* */ +#include "config.h" +/* for ETHER_CRC_LEN */ +#include +/*----------------------------------------------------------------------------*/ +#define PS_CHUNK_SIZE 64 +#define PS_SELECT_TIMEOUT 100 /* in us */ + +/* + * Ethernet frame overhead + */ + +#define ETHER_IFG 12 +#define ETHER_PREAMBLE 8 +#define ETHER_OVR (ETHER_CRC_LEN + ETHER_PREAMBLE + ETHER_IFG) +/*----------------------------------------------------------------------------*/ +struct ps_device devices[MAX_DEVICES]; +/*----------------------------------------------------------------------------*/ +struct psio_private_context { + struct ps_handle handle; + struct ps_chunk_buf w_chunk_buf[ETH_NUM]; + struct ps_chunk chunk; + struct ps_event event; + + nids_set rx_avail; + nids_set tx_avail; + struct timeval last_tx_set[ETH_NUM]; +} __attribute__((aligned(__WORDSIZE))); +/*----------------------------------------------------------------------------*/ +void psio_init_handle(struct mtcp_thread_context *ctxt) +{ + int i, ret; + struct psio_private_context *ppc; + struct timeval cur_ts; + + /* create and initialize private I/O module context */ + ctxt->io_private_context = calloc(1, sizeof(struct psio_private_context)); + if (ctxt->io_private_context == NULL) { + TRACE_ERROR("Failed to initialize ctxt->io_private_context: " + "Can't allocate memory\n"); + exit(EXIT_FAILURE); + } + + ppc = (struct psio_private_context *)ctxt->io_private_context; + if (ps_init_handle(&ppc->handle)) { + perror("ps_init_handle"); + TRACE_ERROR("Failed to initialize ps handle.\n"); + exit(EXIT_FAILURE); + } + + /* create buffer for reading ingress batch of packet */ + if (ps_alloc_chunk(&ppc->handle, &ppc->chunk) != 0) { + perror("ps_alloc_chunk"); + 
TRACE_ERROR("Failed to allocate ps_chunk\n"); + exit(EXIT_FAILURE); + } + + /* create packet write chunk */ + for (i = 0; i < num_devices_attached; i++) { + ret = ps_alloc_chunk_buf(&ppc->handle, i, ctxt->cpu, &ppc->w_chunk_buf[i]); + if (ret != 0) { + TRACE_ERROR("Failed to allocate ps_chunk_buf.\n"); + exit(EXIT_FAILURE); + } + } + + gettimeofday(&cur_ts, NULL); + + /* initialize PSIO parameters */ + ppc->chunk.recv_blocking = 0; + ppc->event.timeout = PS_SELECT_TIMEOUT; + ppc->event.qidx = ctxt->cpu; + NID_ZERO(ppc->event.rx_nids); + NID_ZERO(ppc->event.tx_nids); + NID_ZERO(ppc->rx_avail); + //NID_ZERO(ppc->tx_avail); + + for (i = 0; i < CONFIG.eths_num; i++) { + ppc->last_tx_set[i] = cur_ts; + NID_SET(i, ppc->tx_avail); + } +} +/*----------------------------------------------------------------------------*/ +int psio_link_devices(struct mtcp_thread_context *ctxt) +{ + struct psio_private_context *ppc; + int ret; + int i, working; + + ppc = (struct psio_private_context *)ctxt->io_private_context; + working = -1; + + /* attaching (device, queue) */ + for (i = 0; i < num_devices_attached; i++) { + struct ps_queue queue; + queue.ifindex = devices_attached[i]; + + if (devices[devices_attached[i]].num_rx_queues <= ctxt->cpu) { + continue; + } + + working = 0; + queue.ifindex = devices_attached[i]; + queue.qidx = ctxt->cpu; + +#if 0 + TRACE_DBG("attaching RX queue xge%d:%d to CPU%d\n", + queue.ifindex, queue.qidx, mtcp->ctxt->cpu); +#endif + ret = ps_attach_rx_device(&ppc->handle, &queue); + if (ret != 0) { + perror("ps_attach_rx_device"); + exit(1); + } + } + return working; +} +/*----------------------------------------------------------------------------*/ +void psio_release_pkt(struct mtcp_thread_context *ctxt, int ifidx, unsigned char *pkt_data, int len) +{ + struct psio_private_context *ppc; + struct ps_packet packet; + + ppc = (struct psio_private_context *)ctxt->io_private_context; + /* pass the packet to the kernel */ + packet.ifindex = ifidx; + packet.len = 
len; + packet.buf = (char *)pkt_data; + ps_slowpath_packet(&ppc->handle, &packet); +} +/*----------------------------------------------------------------------------*/ +static int psio_flush_pkts(struct mtcp_thread_context *ctx, int nif) +{ + struct ps_chunk_buf *c_buf; + mtcp_manager_t mtcp; + struct psio_private_context *ppc; + int send_cnt, to_send_cnt = 0; + int start_idx, i; + + ppc = (struct psio_private_context *)ctx->io_private_context; + c_buf = &ppc->w_chunk_buf[nif]; + mtcp = ctx->mtcp_manager; + + /* if chunk (for writing) is not there... then return */ + if (!c_buf) + return -1; + + to_send_cnt = c_buf->cnt; + if (to_send_cnt > 0) { + STAT_COUNT(mtcp->runstat.rounds_tx_try); + start_idx = c_buf->next_to_send; + send_cnt = ps_send_chunk_buf(&ppc->handle, c_buf); + + for (i = 0; i < send_cnt; i++) { +#ifdef NETSTAT + mtcp->nstat.tx_bytes[nif] += c_buf->info[start_idx].len + ETHER_OVR; +#endif +#if PKTDUMP + DumpPacket(mtcp, c_buf->buf + c_buf->info[start_idx].offset, c_buf->info[start_idx].len, "OUT", nif); + +#endif + start_idx = (start_idx + 1) % ENTRY_CNT; + } + if (send_cnt < 0) { + TRACE_ERROR("ps_send_chunk_buf failed. 
" + "ret: %d, error: %s\n", + send_cnt, strerror(errno)); +#ifdef NETSTAT + } else { + mtcp->nstat.tx_packets[nif] += send_cnt; +#endif + } + + return send_cnt; + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +int psio_send_pkts(struct mtcp_thread_context *ctxt, int nif) +{ + struct psio_private_context *ppc; + mtcp_manager_t mtcp; + int ret, prev_cnt; + + ppc = (struct psio_private_context *)ctxt->io_private_context; + mtcp = ctxt->mtcp_manager; + +#if 0 + /* if tx if not available, pass */ + if (!NID_ISSET(nif, ppc->tx_avail)) { + NID_SET(nif, ppc->event.tx_nids); + return -1; + } +#endif + while ((prev_cnt = ppc->w_chunk_buf[nif].cnt) > 0) { + ret = psio_flush_pkts(ctxt, nif); + if (ret <= 0) { + if (ret < 0) + TRACE_ERROR("ps_send_chunk_buf failed to send.\n"); + NID_SET(nif, ppc->event.tx_nids); + NID_CLR(nif, ppc->tx_avail); + break; + } else if (ret < prev_cnt) { + NID_CLR(nif, ppc->tx_avail); + NID_SET(nif, ppc->event.tx_nids); + STAT_COUNT(mtcp->runstat.rounds_tx); + break; + } else { + STAT_COUNT(mtcp->runstat.rounds_tx); + } + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +uint8_t *psio_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t len) +{ + struct psio_private_context *ppc; + struct ps_chunk_buf *c_buf; + + ppc = (struct psio_private_context *)ctxt->io_private_context; + c_buf = &ppc->w_chunk_buf[nif]; + + /* retrieve the right write offset */ + return (uint8_t *)ps_assign_chunk_buf(c_buf, len); +} +/*----------------------------------------------------------------------------*/ +int32_t psio_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx) +{ + int ret, no_rx_packet; + struct psio_private_context *ppc; + + ppc = (struct psio_private_context *)ctxt->io_private_context; + no_rx_packet = 0; + + ppc->chunk.cnt = PS_CHUNK_SIZE; + + ret = ps_recv_chunk_ifidx(&ppc->handle, &ppc->chunk, ifidx); + if (ret < 0) { + if (errno != 
EAGAIN) { + TRACE_ERROR("ps_recv_chunk_ifidx failed to read packets.\n"); + perror("ps_recv_chunk_ifidx()"); + } + NID_SET(ifidx, ppc->event.rx_nids); + no_rx_packet = 1; + } else if (ret == 0) { + NID_SET(ifidx, ppc->event.rx_nids); + no_rx_packet = 1; + } + + if (!no_rx_packet) + NID_SET(ifidx, ppc->rx_avail); + + return ret; +} +/*----------------------------------------------------------------------------*/ +uint8_t *psio_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len) +{ + struct psio_private_context *ppc; + uint8_t *pktbuf; + + ppc = (struct psio_private_context *)ctxt->io_private_context; + pktbuf = (uint8_t *)(ppc->chunk.buf + ppc->chunk.info[index].offset); + *len = ppc->chunk.info[index].len; + + (void)(ifidx); + return pktbuf; +} +/*----------------------------------------------------------------------------*/ +int32_t psio_select(struct mtcp_thread_context *ctxt) +{ + struct psio_private_context *ppc; + mtcp_manager_t mtcp; + struct timeval cur_ts; + int i, ret; + + ppc = (struct psio_private_context *)ctxt->io_private_context; + mtcp = ctxt->mtcp_manager; + gettimeofday(&cur_ts, NULL); + + if (!ppc->rx_avail || ppc->event.tx_nids) { + for (i = 0; i < CONFIG.eths_num; i++) { + if (ppc->w_chunk_buf[i].cnt > 0) + NID_SET(i, ppc->event.tx_nids); + if (mtcp->n_sender[i]->control_list_cnt > 0 || mtcp->n_sender[i]->send_list_cnt > 0 || + mtcp->n_sender[i]->ack_list_cnt > 0) { + if (cur_ts.tv_sec > ppc->last_tx_set[i].tv_sec || cur_ts.tv_usec > ppc->last_tx_set[i].tv_usec) { + NID_SET(i, ppc->event.tx_nids); + ppc->last_tx_set[i] = cur_ts; + } + } + } + + TRACE_SELECT("BEFORE: rx_avail: %d, tx_avail: %d, event.rx_nids: %0x, event.tx_nids: %0x\n", ppc->rx_avail, + ppc->tx_avail, ppc->event.rx_nids, ppc->event.tx_nids); + mtcp->is_sleeping = TRUE; + ret = ps_select(&ppc->handle, &ppc->event); + mtcp->is_sleeping = FALSE; +#if TIME_STAT + gettimeofday(&select_ts, NULL); + UpdateStatCounter(&mtcp->rtstat.select, 
TimeDiffUs(&select_ts, &xmit_ts)); +#endif + if (ret < 0) { + if (errno != EAGAIN && errno != EINTR) { + perror("ps_select"); + exit(EXIT_FAILURE); + } + if (errno == EINTR) { + STAT_COUNT(mtcp->runstat.rounds_select_intr); + } + } else { + TRACE_SELECT("ps_select(): event.rx_nids: %0x, event.tx_nids: %0x\n", ppc->event.rx_nids, ppc->event.tx_nids); + if (ppc->event.rx_nids != 0) { + STAT_COUNT(mtcp->runstat.rounds_select_rx); + } + if (ppc->event.tx_nids != 0) { + for (i = 0; i < CONFIG.eths_num; i++) { + if (NID_ISSET(i, ppc->event.tx_nids)) { + NID_SET(i, ppc->tx_avail); + } + } + STAT_COUNT(mtcp->runstat.rounds_select_tx); + } + } + TRACE_SELECT("AFTER: rx_avail: %d, tx_avail: %d, event.rx_nids: %d, event.tx_nids: %d\n", ppc->rx_avail, ppc->tx_avail, + ppc->event.rx_nids, ppc->event.tx_nids); + STAT_COUNT(mtcp->runstat.rounds_select); + } + + /* reset psio parameters */ + ppc->event.timeout = PS_SELECT_TIMEOUT; + NID_ZERO(ppc->event.rx_nids); + NID_ZERO(ppc->event.tx_nids); + NID_ZERO(ppc->rx_avail); + //NID_ZERO(ppc->tx_avail); + + return 0; +} +/*----------------------------------------------------------------------------*/ +void psio_destroy_handle(struct mtcp_thread_context *ctxt) +{ + struct psio_private_context *ppc; + + ppc = (struct psio_private_context *)ctxt->io_private_context; + + /* free it all up */ + free(ppc); +} +/*----------------------------------------------------------------------------*/ +void psio_load_module(void) +{ + /* PSIO does not support FDIR/multi-process support */ + if (CONFIG.multi_process) { + TRACE_LOG("PSIO module does not provide multi-process support\n"); + exit(EXIT_FAILURE); + } +} +/*----------------------------------------------------------------------------*/ +io_module_func ps_module_func = { .load_module = psio_load_module, + .init_handle = psio_init_handle, + .link_devices = psio_link_devices, + .release_pkt = psio_release_pkt, + .send_pkts = psio_send_pkts, + .get_wptr = psio_get_wptr, + .recv_pkts = 
psio_recv_pkts, + .get_rptr = psio_get_rptr, + .select = psio_select, + .destroy_handle = psio_destroy_handle, + .dev_ioctl = NULL }; +#else +io_module_func ps_module_func = { .load_module = NULL, + .init_handle = NULL, + .link_devices = NULL, + .release_pkt = NULL, + .send_pkts = NULL, + .get_wptr = NULL, + .recv_pkts = NULL, + .get_rptr = NULL, + .select = NULL, + .destroy_handle = NULL, + .dev_ioctl = NULL }; +/*----------------------------------------------------------------------------*/ +#endif /* !DISABLE_PSIO */ diff --git a/lib/flash/mtcp/rss.c b/lib/flash/mtcp/rss.c new file mode 100644 index 0000000..b949ea8 --- /dev/null +++ b/lib/flash/mtcp/rss.c @@ -0,0 +1,135 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "rss.h" + +/*-------------------------------------------------------------*/ +static void BuildKeyCache(uint32_t *cache, int cache_len) +{ +#define NBBY 8 /* number of bits per byte */ + + /* Keys for system testing */ + static const uint8_t key[] = { 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 }; + + uint32_t result = (((uint32_t)key[0]) << 24) | (((uint32_t)key[1]) << 16) | (((uint32_t)key[2]) << 8) | ((uint32_t)key[3]); + + uint32_t idx = 32; + int i; + + for (i = 0; i < cache_len; i++, idx++) { + uint8_t shift = (idx % NBBY); + uint32_t bit; + + cache[i] = result; + bit = ((key[idx / NBBY] << shift) & 0x80) ? 
1 : 0; + result = ((result << 1) | bit); + } +} +/*-------------------------------------------------------------*/ +static uint32_t GetRSSHash(in_addr_t sip, in_addr_t dip, in_port_t sp, in_port_t dp) +{ +#define MSB32 0x80000000 +#define MSB16 0x8000 +#define KEY_CACHE_LEN 96 + + uint32_t res = 0; + int i; + static int first = 1; + static uint32_t key_cache[KEY_CACHE_LEN] = { 0 }; + + if (first) { + BuildKeyCache(key_cache, KEY_CACHE_LEN); + first = 0; + } + + for (i = 0; i < 32; i++) { + if (sip & MSB32) + res ^= key_cache[i]; + sip <<= 1; + } + for (i = 0; i < 32; i++) { + if (dip & MSB32) + res ^= key_cache[32 + i]; + dip <<= 1; + } + for (i = 0; i < 16; i++) { + if (sp & MSB16) + res ^= key_cache[64 + i]; + sp <<= 1; + } + for (i = 0; i < 16; i++) { + if (dp & MSB16) + res ^= key_cache[80 + i]; + dp <<= 1; + } + return res; +} +/*-------------------------------------------------------------------*/ +/* RSS redirection table is in the little endian byte order (intel) */ +/* */ +/* idx: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19 ...*/ +/* val: 3 2 1 0 | 7 6 5 4 | 11 10 9 8 | 15 14 13 12 | 19 18 17 16 ...*/ +/* qid = val % num_queues */ +/*-------------------------------------------------------------------*/ +/* + * IXGBE (Intel X520 NIC) : (Rx queue #) = (7 LS bits of RSS hash) mod N + * I40E (Intel XL710 NIC) : (Rx queue #) = (9 LS bits of RSS hash) mod N + */ +#define RSS_BIT_MASK_IXGBE 0x0000007F +#define RSS_BIT_MASK_I40E 0x000001FF + +int GetRSSCPUCore(in_addr_t sip, in_addr_t dip, in_port_t sp, in_port_t dp, int num_queues, uint8_t endian_check) +{ + uint32_t masked; + + if (endian_check) { + /* i40e */ + static const uint32_t off[] = { 3, 1, -1, -3 }; + masked = GetRSSHash(sip, dip, sp, dp) & RSS_BIT_MASK_I40E; + masked += off[masked & 0x3]; + } else { + /* ixgbe or mlx* */ + masked = GetRSSHash(sip, dip, sp, dp) & RSS_BIT_MASK_IXGBE; + } + + return (masked % num_queues); +} 
+/*-------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/socket.c b/lib/flash/mtcp/socket.c new file mode 100644 index 0000000..a727938 --- /dev/null +++ b/lib/flash/mtcp/socket.c @@ -0,0 +1,118 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "mtcp.h" +#include "socket.h" +#include "debug.h" + +/*---------------------------------------------------------------------------*/ +socket_map_t AllocateSocket(mctx_t mctx, int socktype, int need_lock) +{ + mtcp_manager_t mtcp = g_mtcp[mctx->cpu]; + socket_map_t socket = NULL; + + if (need_lock) + pthread_mutex_lock(&mtcp->ctx->smap_lock); + + while (socket == NULL) { + socket = TAILQ_FIRST(&mtcp->free_smap); + if (!socket) { + if (need_lock) + pthread_mutex_unlock(&mtcp->ctx->smap_lock); + + TRACE_ERROR("The concurrent sockets are at maximum.\n"); + return NULL; + } + + TAILQ_REMOVE(&mtcp->free_smap, socket, free_smap_link); + + /* if there is not invalidated events, insert the socket to the end */ + /* and find another socket in the free smap list */ + if (socket->events) { + TRACE_INFO("There are still not invalidate events remaining.\n"); + TRACE_DBG("There are still not invalidate events remaining.\n"); + TAILQ_INSERT_TAIL(&mtcp->free_smap, socket, free_smap_link); + socket = NULL; + } + } + + if (need_lock) + pthread_mutex_unlock(&mtcp->ctx->smap_lock); + + socket->socktype = socktype; + socket->opts = 0; + socket->stream = NULL; + socket->epoll = 0; + socket->events = 0; + + /* + * reset a few fields (needed for client socket) + * addr = INADDR_ANY, port = INPORT_ANY + */ + memset(&socket->saddr, 0, sizeof(struct sockaddr_in)); + memset(&socket->ep_data, 0, sizeof(mtcp_epoll_data_t)); + + return socket; +} +/*---------------------------------------------------------------------------*/ +void FreeSocket(mctx_t mctx, int sockid, int need_lock) +{ + mtcp_manager_t mtcp = g_mtcp[mctx->cpu]; + socket_map_t socket = &mtcp->smap[sockid]; + + if (socket->socktype == MTCP_SOCK_UNUSED) { + return; + } + + socket->socktype = MTCP_SOCK_UNUSED; + socket->epoll = MTCP_EPOLLNONE; + socket->events = 0; + + if (need_lock) + pthread_mutex_lock(&mtcp->ctx->smap_lock); + + /* insert into free stream map */ + mtcp->smap[sockid].stream = NULL; + 
TAILQ_INSERT_TAIL(&mtcp->free_smap, socket, free_smap_link); + + if (need_lock) + pthread_mutex_unlock(&mtcp->ctx->smap_lock); +} +/*---------------------------------------------------------------------------*/ +socket_map_t GetSocket(mctx_t mctx, int sockid) +{ + if (sockid < 0 || sockid >= CONFIG.max_concurrency) { + errno = EBADF; + return NULL; + } + + return &g_mtcp[mctx->cpu]->smap[sockid]; +} diff --git a/lib/flash/mtcp/tcp_in.c b/lib/flash/mtcp/tcp_in.c new file mode 100644 index 0000000..47604e3 --- /dev/null +++ b/lib/flash/mtcp/tcp_in.c @@ -0,0 +1,1290 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include "tcp_util.h" +#include "tcp_in.h" +#include "tcp_out.h" +#include "tcp_ring_buffer.h" +#include "eventpoll.h" +#include "debug.h" +#include "timer.h" +#include "ip_in.h" +#include "clock.h" +#if USE_CCP +#include "ccp.h" +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define VERIFY_RX_CHECKSUM TRUE +#define RECOVERY_AFTER_LOSS TRUE +#define SELECTIVE_WRITE_EVENT_NOTIFY TRUE + +/*----------------------------------------------------------------------------*/ +static inline int FilterSYNPacket(mtcp_manager_t mtcp, uint32_t ip, uint16_t port) +{ + struct sockaddr_in *addr; + struct tcp_listener *listener; + + /* TODO: This listening logic should be revised */ + + /* if not the address we want, drop */ + listener = (struct tcp_listener *)ListenerHTSearch(mtcp->listeners, &port); + if (listener == NULL) + return FALSE; + + addr = &listener->socket->saddr; + + if (addr->sin_port == port) { + if (addr->sin_addr.s_addr != INADDR_ANY) { + if (ip == addr->sin_addr.s_addr) { + return TRUE; + } + return FALSE; + } else { + int i; + + for (i = 0; i < CONFIG.eths_num; i++) { + if (ip == CONFIG.eths[i].ip_addr) { + return TRUE; + } + } + return FALSE; + } + } + + return FALSE; +} +/*----------------------------------------------------------------------------*/ +static inline tcp_stream *HandlePassiveOpen(mtcp_manager_t mtcp, uint32_t cur_ts, const struct iphdr *iph, const struct tcphdr 
*tcph, + uint32_t seq, uint16_t window) +{ + tcp_stream *cur_stream = NULL; + + /* create new stream and add to flow hash table */ + cur_stream = CreateTCPStream(mtcp, NULL, MTCP_SOCK_STREAM, iph->daddr, tcph->dest, iph->saddr, tcph->source); + if (!cur_stream) { + TRACE_ERROR("INFO: Could not allocate tcp_stream!\n"); + return FALSE; + } + cur_stream->rcvvar->irs = seq; + cur_stream->sndvar->peer_wnd = window; + cur_stream->rcv_nxt = cur_stream->rcvvar->irs; + cur_stream->sndvar->cwnd = 1; + ParseTCPOptions(cur_stream, cur_ts, (const uint8_t *)tcph + TCP_HEADER_LEN, (tcph->doff << 2) - TCP_HEADER_LEN); + + return cur_stream; +} +/*----------------------------------------------------------------------------*/ +static inline int HandleActiveOpen(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts, struct tcphdr *tcph, uint32_t seq, + uint32_t ack_seq, uint16_t window) +{ + cur_stream->rcvvar->irs = seq; + cur_stream->snd_nxt = ack_seq; + cur_stream->sndvar->peer_wnd = window; + cur_stream->rcvvar->snd_wl1 = cur_stream->rcvvar->irs - 1; + cur_stream->rcv_nxt = cur_stream->rcvvar->irs + 1; + cur_stream->rcvvar->last_ack_seq = ack_seq; + ParseTCPOptions(cur_stream, cur_ts, (const uint8_t *)tcph + TCP_HEADER_LEN, (tcph->doff << 2) - TCP_HEADER_LEN); + cur_stream->sndvar->cwnd = + ((cur_stream->sndvar->cwnd == 1) ? 
(cur_stream->sndvar->mss * TCP_INIT_CWND) : cur_stream->sndvar->mss); + cur_stream->sndvar->ssthresh = cur_stream->sndvar->mss * 10; + UpdateRetransmissionTimer(mtcp, cur_stream, cur_ts); + + return TRUE; +} +/*----------------------------------------------------------------------------*/ +/* ValidateSequence: validates sequence number of the segment */ +/* Return: TRUE if acceptable, FALSE if not acceptable */ +/*----------------------------------------------------------------------------*/ +static inline int ValidateSequence(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts, struct tcphdr *tcph, uint32_t seq, + uint32_t ack_seq, int payloadlen) +{ + (void)ack_seq; + /* Protect Against Wrapped Sequence number (PAWS) */ + if (!tcph->rst && cur_stream->saw_timestamp) { + struct tcp_timestamp ts; + + if (!ParseTCPTimestamp(cur_stream, &ts, (uint8_t *)tcph + TCP_HEADER_LEN, (tcph->doff << 2) - TCP_HEADER_LEN)) { + /* if there is no timestamp */ + /* TODO: implement here */ + TRACE_DBG("No timestamp found.\n"); + return FALSE; + } + + /* RFC1323: if SEG.TSval < TS.Recent, drop and send ack */ + if (TCP_SEQ_LT(ts.ts_val, cur_stream->rcvvar->ts_recent)) { + /* TODO: ts_recent should be invalidated + before timestamp wraparound for long idle flow */ + TRACE_DBG("PAWS Detect wrong timestamp. " + "seq: %u, ts_val: %u, prev: %u\n", + seq, ts.ts_val, cur_stream->rcvvar->ts_recent); + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_NOW); + return FALSE; + } else { + /* valid timestamp */ + if (TCP_SEQ_GT(ts.ts_val, cur_stream->rcvvar->ts_recent)) { + TRACE_TSTAMP("Timestamp update. 
cur: %u, prior: %u " + "(time diff: %uus)\n", + ts.ts_val, cur_stream->rcvvar->ts_recent, + TS_TO_USEC(cur_ts - cur_stream->rcvvar->ts_last_ts_upd)); + cur_stream->rcvvar->ts_last_ts_upd = cur_ts; + } + + cur_stream->rcvvar->ts_recent = ts.ts_val; + cur_stream->rcvvar->ts_lastack_rcvd = ts.ts_ref; + } + } + + /* TCP sequence validation */ + if (!TCP_SEQ_BETWEEN(seq + payloadlen, cur_stream->rcv_nxt, cur_stream->rcv_nxt + cur_stream->rcvvar->rcv_wnd)) { + /* if RST bit is set, ignore the segment */ + if (tcph->rst) + return FALSE; + + if (cur_stream->state == TCP_ST_ESTABLISHED) { + /* check if it is to get window advertisement */ + if (seq + 1 == cur_stream->rcv_nxt) { +#if 0 + TRACE_DBG("Window update request. (seq: %u, rcv_wnd: %u)\n", + seq, cur_stream->rcvvar->rcv_wnd); +#endif + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_AGGREGATE); + return FALSE; + } + + if (TCP_SEQ_LEQ(seq, cur_stream->rcv_nxt)) { + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_AGGREGATE); + } else { + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_NOW); + } + } else { + if (cur_stream->state == TCP_ST_TIME_WAIT) { + TRACE_DBG("Stream %d: tw expire update to %u\n", cur_stream->id, cur_stream->rcvvar->ts_tw_expire); + AddtoTimewaitList(mtcp, cur_stream, cur_ts); + } + AddtoControlList(mtcp, cur_stream, cur_ts); + } + return FALSE; + } + + return TRUE; +} +/*----------------------------------------------------------------------------*/ +static inline void NotifyConnectionReset(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + (void)mtcp; + (void)cur_stream; + TRACE_DBG("Stream %d: Notifying connection reset.\n", cur_stream->id); + /* TODO: implement this function */ + /* signal to user "connection reset" */ +} +/*----------------------------------------------------------------------------*/ +static inline int ProcessRST(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t ack_seq) +{ + /* TODO: we need reset validation logic */ + /* the sequence number of a RST should be inside window */ + /* 
(in SYN_SENT state, it should ack the previous SYN */ + + TRACE_DBG("Stream %d: TCP RESET (%s)\n", cur_stream->id, TCPStateToString(cur_stream)); +#if defined(DUMP_STREAM) + DumpStream(mtcp, cur_stream); +#endif + + if (cur_stream->state <= TCP_ST_SYN_SENT) { + /* not handled here */ + return FALSE; + } + + if (cur_stream->state == TCP_ST_SYN_RCVD) { + if (ack_seq == cur_stream->snd_nxt) { + cur_stream->state = TCP_ST_CLOSED; + cur_stream->close_reason = TCP_RESET; + DestroyTCPStream(mtcp, cur_stream); + } + return TRUE; + } + + /* if the application is already closed the connection, + just destroy the it */ + if (cur_stream->state == TCP_ST_FIN_WAIT_1 || cur_stream->state == TCP_ST_FIN_WAIT_2 || cur_stream->state == TCP_ST_LAST_ACK || + cur_stream->state == TCP_ST_CLOSING || cur_stream->state == TCP_ST_TIME_WAIT) { + cur_stream->state = TCP_ST_CLOSED; + cur_stream->close_reason = TCP_ACTIVE_CLOSE; + DestroyTCPStream(mtcp, cur_stream); + return TRUE; + } + + if (cur_stream->state >= TCP_ST_ESTABLISHED && cur_stream->state <= TCP_ST_CLOSE_WAIT) { + /* ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT */ + /* TODO: flush all the segment queues */ + NotifyConnectionReset(mtcp, cur_stream); + } + + if (!(cur_stream->sndvar->on_closeq || cur_stream->sndvar->on_closeq_int || cur_stream->sndvar->on_resetq || + cur_stream->sndvar->on_resetq_int)) { + //cur_stream->state = TCP_ST_CLOSED; + //DestroyTCPStream(mtcp, cur_stream); + cur_stream->state = TCP_ST_CLOSE_WAIT; + cur_stream->close_reason = TCP_RESET; + RaiseCloseEvent(mtcp, cur_stream); + } + + return TRUE; +} +/*----------------------------------------------------------------------------*/ +inline void EstimateRTT(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t mrtt) +{ + (void)mtcp; + /* This function should be called for not retransmitted packets */ + /* TODO: determine tcp_rto_min */ +#define TCP_RTO_MIN 0 + long m = mrtt; + uint32_t tcp_rto_min = TCP_RTO_MIN; + struct tcp_recv_vars *rcvvar = 
cur_stream->rcvvar; + + if (m == 0) { + m = 1; + } + if (rcvvar->srtt != 0) { + /* rtt = 7/8 rtt + 1/8 new */ + m -= (rcvvar->srtt >> 3); + rcvvar->srtt += m; + if (m < 0) { + m = -m; + m -= (rcvvar->mdev >> 2); + if (m > 0) { + m >>= 3; + } + } else { + m -= (rcvvar->mdev >> 2); + } + rcvvar->mdev += m; + if (rcvvar->mdev > rcvvar->mdev_max) { + rcvvar->mdev_max = rcvvar->mdev; + if (rcvvar->mdev_max > rcvvar->rttvar) { + rcvvar->rttvar = rcvvar->mdev_max; + } + } + if (TCP_SEQ_GT(cur_stream->sndvar->snd_una, rcvvar->rtt_seq)) { + if (rcvvar->mdev_max < rcvvar->rttvar) { + rcvvar->rttvar -= (rcvvar->rttvar - rcvvar->mdev_max) >> 2; + } + rcvvar->rtt_seq = cur_stream->snd_nxt; + rcvvar->mdev_max = tcp_rto_min; + } + } else { + /* fresh measurement */ + rcvvar->srtt = m << 3; + rcvvar->mdev = m << 1; + rcvvar->mdev_max = rcvvar->rttvar = MAX(rcvvar->mdev, tcp_rto_min); + rcvvar->rtt_seq = cur_stream->snd_nxt; + } + + TRACE_RTT("mrtt: %u (%uus), srtt: %u (%ums), mdev: %u, mdev_max: %u, " + "rttvar: %u, rtt_seq: %u\n", + mrtt, mrtt * TIME_TICK, rcvvar->srtt, TS_TO_MSEC((rcvvar->srtt) >> 3), rcvvar->mdev, rcvvar->mdev_max, + rcvvar->rttvar, rcvvar->rtt_seq); +} + +/*----------------------------------------------------------------------------*/ +static inline void ProcessACK(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts, struct tcphdr *tcph, uint32_t seq, + uint32_t ack_seq, uint16_t window, int payloadlen) +{ + struct tcp_send_vars *sndvar = cur_stream->sndvar; + uint32_t cwindow, cwindow_prev; + uint32_t rmlen; + uint32_t snd_wnd_prev; + uint32_t right_wnd_edge; + uint8_t dup; + int ret; + + cwindow = window; + if (!tcph->syn) { + cwindow = cwindow << sndvar->wscale_peer; + } + right_wnd_edge = sndvar->peer_wnd + cur_stream->rcvvar->snd_wl2; + + /* If ack overs the sending buffer, return */ + if (cur_stream->state == TCP_ST_FIN_WAIT_1 || cur_stream->state == TCP_ST_FIN_WAIT_2 || cur_stream->state == TCP_ST_CLOSING || + cur_stream->state == 
TCP_ST_CLOSE_WAIT || cur_stream->state == TCP_ST_LAST_ACK) { + if (sndvar->is_fin_sent && ack_seq == sndvar->fss + 1) { + ack_seq--; + } + } + + if (TCP_SEQ_GT(ack_seq, sndvar->sndbuf->head_seq + sndvar->sndbuf->len)) { + TRACE_DBG("Stream %d (%s): invalid acknologement. " + "ack_seq: %u, possible max_ack_seq: %u\n", + cur_stream->id, TCPStateToString(cur_stream), ack_seq, sndvar->sndbuf->head_seq + sndvar->sndbuf->len); + return; + } + + /* Update window */ + if (TCP_SEQ_LT(cur_stream->rcvvar->snd_wl1, seq) || + (cur_stream->rcvvar->snd_wl1 == seq && TCP_SEQ_LT(cur_stream->rcvvar->snd_wl2, ack_seq)) || + (cur_stream->rcvvar->snd_wl2 == ack_seq && cwindow > sndvar->peer_wnd)) { + cwindow_prev = sndvar->peer_wnd; + sndvar->peer_wnd = cwindow; + cur_stream->rcvvar->snd_wl1 = seq; + cur_stream->rcvvar->snd_wl2 = ack_seq; +#if 0 + TRACE_CLWND("Window update. " + "ack: %u, peer_wnd: %u, snd_nxt-snd_una: %u\n", + ack_seq, cwindow, cur_stream->snd_nxt - sndvar->snd_una); +#endif + if (cwindow_prev < cur_stream->snd_nxt - sndvar->snd_una && + sndvar->peer_wnd >= cur_stream->snd_nxt - sndvar->snd_una) { + TRACE_CLWND("%u Broadcasting client window update! " + "ack_seq: %u, peer_wnd: %u (before: %u), " + "(snd_nxt - snd_una: %u)\n", + cur_stream->id, ack_seq, sndvar->peer_wnd, cwindow_prev, cur_stream->snd_nxt - sndvar->snd_una); + RaiseWriteEvent(mtcp, cur_stream); + } + } + + /* Check duplicated ack count */ + /* Duplicated ack if + 1) ack_seq is old + 2) payload length is 0. + 3) advertised window not changed. 
+ 4) there is outstanding unacknowledged data + 5) ack_seq == snd_una + */ + + dup = FALSE; + if (TCP_SEQ_LT(ack_seq, cur_stream->snd_nxt)) { + if (ack_seq == cur_stream->rcvvar->last_ack_seq && payloadlen == 0) { + if (cur_stream->rcvvar->snd_wl2 + sndvar->peer_wnd == right_wnd_edge) { + if (cur_stream->rcvvar->dup_acks + 1 > cur_stream->rcvvar->dup_acks) { + cur_stream->rcvvar->dup_acks++; +#if USE_CCP + ccp_record_event(mtcp, cur_stream, EVENT_DUPACK, (cur_stream->snd_nxt - ack_seq)); +#endif + } + dup = TRUE; + } + } + } + if (!dup) { +#if USE_CCP + if (cur_stream->rcvvar->dup_acks >= 3) { + TRACE_DBG("passed dup_acks, ack=%u, snd_nxt=%u, last_ack=%u len=%u wl2=%u peer_wnd=%u right=%u\n", + ack_seq - sndvar->iss, cur_stream->snd_nxt - sndvar->iss, + cur_stream->rcvvar->last_ack_seq - sndvar->iss, payloadlen, + cur_stream->rcvvar->snd_wl2 - sndvar->iss, sndvar->peer_wnd / sndvar->mss, + right_wnd_edge - sndvar->iss); + } +#endif + cur_stream->rcvvar->dup_acks = 0; + cur_stream->rcvvar->last_ack_seq = ack_seq; + } +#if USE_CCP + if (cur_stream->wait_for_acks) { + TRACE_DBG("got ack, but waiting to send... ack=%u, snd_next=%u cwnd=%u\n", ack_seq - sndvar->iss, + cur_stream->snd_nxt - sndvar->iss, sndvar->cwnd / sndvar->mss); + } +#endif + /* Fast retransmission */ + if (dup && cur_stream->rcvvar->dup_acks == 3) { + TRACE_LOSS("Triple duplicated ACKs!! ack_seq: %u\n", ack_seq); + TRACE_CCP("tridup ack %u (%u)!\n", ack_seq - cur_stream->sndvar->iss, ack_seq); + if (TCP_SEQ_LT(ack_seq, cur_stream->snd_nxt)) { + TRACE_LOSS("Reducing snd_nxt from %u to %u\n", cur_stream->snd_nxt - sndvar->iss, + ack_seq - cur_stream->sndvar->iss); + +#if RTM_STAT + sndvar->rstat.tdp_ack_cnt++; + sndvar->rstat.tdp_ack_bytes += (cur_stream->snd_nxt - ack_seq); +#endif + +#if USE_CCP + ccp_record_event(mtcp, cur_stream, EVENT_TRI_DUPACK, ack_seq); +#endif + if (ack_seq != sndvar->snd_una) { + TRACE_DBG("ack_seq and snd_una mismatch on tdp ack. 
" + "ack_seq: %u, snd_una: %u\n", + ack_seq, sndvar->snd_una); + } +#if USE_CCP + sndvar->missing_seq = ack_seq; +#else + cur_stream->snd_nxt = ack_seq; +#endif + } + + /* update congestion control variables */ + /* ssthresh to half of min of cwnd and peer wnd */ + sndvar->ssthresh = MIN(sndvar->cwnd, sndvar->peer_wnd) / 2; + if (sndvar->ssthresh < 2 * sndvar->mss) { + sndvar->ssthresh = 2 * sndvar->mss; + } + sndvar->cwnd = sndvar->ssthresh + 3 * sndvar->mss; + + TRACE_CONG("fast retrans: cwnd = ssthresh(%u)+3*mss = %u\n", sndvar->ssthresh / sndvar->mss, + sndvar->cwnd / sndvar->mss); + + /* count number of retransmissions */ + if (sndvar->nrtx < TCP_MAX_RTX) { + sndvar->nrtx++; + } else { + TRACE_DBG("Exceed MAX_RTX.\n"); + } + + AddtoSendList(mtcp, cur_stream); + + } else if (cur_stream->rcvvar->dup_acks > 3) { + /* Inflate congestion window until before overflow */ + if ((uint32_t)(sndvar->cwnd + sndvar->mss) > sndvar->cwnd) { + sndvar->cwnd += sndvar->mss; + TRACE_CONG("Dupack cwnd inflate. 
cwnd: %u, ssthresh: %u\n", sndvar->cwnd, sndvar->ssthresh); + } + } + +#if TCP_OPT_SACK_ENABLED + ParseSACKOption(cur_stream, ack_seq, (uint8_t *)tcph + TCP_HEADER_LEN, (tcph->doff << 2) - TCP_HEADER_LEN); +#endif /* TCP_OPT_SACK_ENABLED */ + +#if RECOVERY_AFTER_LOSS +#if USE_CCP + /* updating snd_nxt (when recovered from loss) */ + if (TCP_SEQ_GT(ack_seq, cur_stream->snd_nxt) || (cur_stream->wait_for_acks && TCP_SEQ_GT(ack_seq, cur_stream->seq_at_last_loss) +#if TCP_OPT_SACK_ENABLED + && cur_stream->rcvvar->sacked_pkts == 0 +#endif + )) +#else + if (TCP_SEQ_GT(ack_seq, cur_stream->snd_nxt)) +#endif /* USE_CCP */ + { +#if RTM_STAT + sndvar->rstat.ack_upd_cnt++; + sndvar->rstat.ack_upd_bytes += (ack_seq - cur_stream->snd_nxt); +#endif + // fast retransmission exit: cwnd=ssthresh + cur_stream->sndvar->cwnd = cur_stream->sndvar->ssthresh; + + TRACE_LOSS("Updating snd_nxt from %u to %u\n", cur_stream->snd_nxt, ack_seq); +#if USE_CCP + cur_stream->wait_for_acks = FALSE; +#endif + cur_stream->snd_nxt = ack_seq; + TRACE_DBG("Sending again..., ack_seq=%u sndlen=%u cwnd=%u\n", ack_seq - sndvar->iss, sndvar->sndbuf->len, + sndvar->cwnd / sndvar->mss); + if (sndvar->sndbuf->len == 0) { + RemoveFromSendList(mtcp, cur_stream); + } else { + AddtoSendList(mtcp, cur_stream); + } + } +#endif /* RECOVERY_AFTER_LOSS */ + + rmlen = ack_seq - sndvar->sndbuf->head_seq; + uint16_t packets = rmlen / sndvar->eff_mss; + if (packets * sndvar->eff_mss > rmlen) { + packets++; + } + +#if USE_CCP + ccp_cong_control(mtcp, cur_stream, ack_seq, rmlen, packets); +#else + // log_cwnd_rtt(cur_stream); +#endif + + /* If ack_seq is previously acked, return */ + if (TCP_SEQ_GEQ(sndvar->sndbuf->head_seq, ack_seq)) { + return; + } + + /* Remove acked sequence from send buffer */ + if (rmlen > 0) { + /* Routine goes here only if there is new payload (not retransmitted) */ + + /* Estimate RTT and calculate rto */ + if (cur_stream->saw_timestamp) { + EstimateRTT(mtcp, cur_stream, cur_ts - 
cur_stream->rcvvar->ts_lastack_rcvd); + sndvar->rto = (cur_stream->rcvvar->srtt >> 3) + cur_stream->rcvvar->rttvar; + assert(sndvar->rto > 0); + } else { + //TODO: Need to implement timestamp estimation without timestamp + TRACE_RTT("NOT IMPLEMENTED.\n"); + } + + // TODO CCP should comment this out? + /* Update congestion control variables */ + if (cur_stream->state >= TCP_ST_ESTABLISHED) { + if (sndvar->cwnd < sndvar->ssthresh) { + if ((sndvar->cwnd + sndvar->mss) > sndvar->cwnd) { + sndvar->cwnd += (sndvar->mss * packets); + } + TRACE_CONG("slow start cwnd: %u, ssthresh: %u\n", sndvar->cwnd, sndvar->ssthresh); + } else { + uint32_t new_cwnd = sndvar->cwnd + packets * sndvar->mss * sndvar->mss / sndvar->cwnd; + if (new_cwnd > sndvar->cwnd) { + sndvar->cwnd = new_cwnd; + } + //TRACE_CONG("congestion avoidance cwnd: %u, ssthresh: %u\n", + // sndvar->cwnd, sndvar->ssthresh); + } + } + + if (SBUF_LOCK(&sndvar->write_lock)) { + if (errno == EDEADLK) + perror("ProcessACK: write_lock blocked\n"); + assert(0); + } + ret = SBRemove(mtcp->rbm_snd, sndvar->sndbuf, rmlen); + sndvar->snd_una = ack_seq; + snd_wnd_prev = sndvar->snd_wnd; + sndvar->snd_wnd = sndvar->sndbuf->size - sndvar->sndbuf->len; + + /* If there was no available sending window */ + /* notify the newly available window to application */ +#if SELECTIVE_WRITE_EVENT_NOTIFY + if (snd_wnd_prev <= 0) { +#endif /* SELECTIVE_WRITE_EVENT_NOTIFY */ + RaiseWriteEvent(mtcp, cur_stream); +#if SELECTIVE_WRITE_EVENT_NOTIFY + } +#endif /* SELECTIVE_WRITE_EVENT_NOTIFY */ + + SBUF_UNLOCK(&sndvar->write_lock); + UpdateRetransmissionTimer(mtcp, cur_stream, cur_ts); + } + + UNUSED(ret); +} +/*----------------------------------------------------------------------------*/ +/* ProcessTCPPayload: merges TCP payload using receive ring buffer */ +/* Return: TRUE (1) in normal case, FALSE (0) if immediate ACK is required */ +/* CAUTION: should only be called at ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2 */ 
+/*----------------------------------------------------------------------------*/ +static inline int ProcessTCPPayload(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts, uint8_t *payload, uint32_t seq, + int payloadlen) +{ + (void)cur_ts; + struct tcp_recv_vars *rcvvar = cur_stream->rcvvar; + uint32_t prev_rcv_nxt; + int ret; + + /* if seq and segment length is lower than rcv_nxt, ignore and send ack */ + if (TCP_SEQ_LT(seq + payloadlen, cur_stream->rcv_nxt)) { + return FALSE; + } + /* if payload exceeds receiving buffer, drop and send ack */ + if (TCP_SEQ_GT(seq + payloadlen, cur_stream->rcv_nxt + rcvvar->rcv_wnd)) { + return FALSE; + } + + /* allocate receive buffer if not exist */ + if (!rcvvar->rcvbuf) { + rcvvar->rcvbuf = RBInit(mtcp->rbm_rcv, rcvvar->irs + 1); + if (!rcvvar->rcvbuf) { + TRACE_ERROR("Stream %d: Failed to allocate receive buffer.\n", cur_stream->id); + cur_stream->state = TCP_ST_CLOSED; + cur_stream->close_reason = TCP_NO_MEM; + RaiseErrorEvent(mtcp, cur_stream); + + return ERROR; + } + } + + if (SBUF_LOCK(&rcvvar->read_lock)) { + if (errno == EDEADLK) + perror("ProcessTCPPayload: read_lock blocked\n"); + assert(0); + } + + prev_rcv_nxt = cur_stream->rcv_nxt; + ret = RBPut(mtcp->rbm_rcv, rcvvar->rcvbuf, payload, (uint32_t)payloadlen, seq); + if (ret < 0) { + TRACE_ERROR("Cannot merge payload. 
reason: %d\n", ret); + } + + /* discard the buffer if the state is FIN_WAIT_1 or FIN_WAIT_2, + meaning that the connection is already closed by the application */ + if (cur_stream->state == TCP_ST_FIN_WAIT_1 || cur_stream->state == TCP_ST_FIN_WAIT_2) { + RBRemove(mtcp->rbm_rcv, rcvvar->rcvbuf, rcvvar->rcvbuf->merged_len, AT_MTCP); + } + cur_stream->rcv_nxt = rcvvar->rcvbuf->head_seq + rcvvar->rcvbuf->merged_len; + rcvvar->rcv_wnd = rcvvar->rcvbuf->size - rcvvar->rcvbuf->merged_len; + + SBUF_UNLOCK(&rcvvar->read_lock); + + if (TCP_SEQ_LEQ(cur_stream->rcv_nxt, prev_rcv_nxt)) { + /* There are some lost packets */ + return FALSE; + } + + TRACE_EPOLL("Stream %d data arrived. " + "len: %d, ET: %u, IN: %u, OUT: %u\n", + cur_stream->id, payloadlen, cur_stream->socket ? cur_stream->socket->epoll & MTCP_EPOLLET : 0, + cur_stream->socket ? cur_stream->socket->epoll & MTCP_EPOLLIN : 0, + cur_stream->socket ? cur_stream->socket->epoll & MTCP_EPOLLOUT : 0); + + if (cur_stream->state == TCP_ST_ESTABLISHED) { + RaiseReadEvent(mtcp, cur_stream); + } + + return TRUE; +} +/*----------------------------------------------------------------------------*/ +static inline tcp_stream *CreateNewFlowHTEntry(mtcp_manager_t mtcp, uint32_t cur_ts, const struct iphdr *iph, int ip_len, + const struct tcphdr *tcph, uint32_t seq, uint32_t ack_seq, int payloadlen, + uint16_t window) +{ + (void)ip_len; + tcp_stream *cur_stream; + int ret; + + if (tcph->syn && !tcph->ack) { + /* handle the SYN */ + ret = FilterSYNPacket(mtcp, iph->daddr, tcph->dest); + if (!ret) { + TRACE_DBG("Refusing SYN packet.\n"); +#ifdef DBGMSG + DumpIPPacket(mtcp, iph, ip_len); +#endif + SendTCPPacketStandalone(mtcp, iph->daddr, tcph->dest, iph->saddr, tcph->source, 0, seq + payloadlen + 1, 0, + TCP_FLAG_RST | TCP_FLAG_ACK, NULL, 0, cur_ts, 0); + + return NULL; + } + + /* now accept the connection */ + cur_stream = HandlePassiveOpen(mtcp, cur_ts, iph, tcph, seq, window); + if (!cur_stream) { + TRACE_DBG("Not available space in 
flow pool.\n"); +#ifdef DBGMSG + DumpIPPacket(mtcp, iph, ip_len); +#endif + SendTCPPacketStandalone(mtcp, iph->daddr, tcph->dest, iph->saddr, tcph->source, 0, seq + payloadlen + 1, 0, + TCP_FLAG_RST | TCP_FLAG_ACK, NULL, 0, cur_ts, 0); + + return NULL; + } + + return cur_stream; + } else if (tcph->rst) { + TRACE_DBG("Reset packet comes\n"); +#ifdef DBGMSG + DumpIPPacket(mtcp, iph, ip_len); +#endif + /* for the reset packet, just discard */ + return NULL; + } else { + TRACE_DBG("Weird packet comes.\n"); +#ifdef DBGMSG + DumpIPPacket(mtcp, iph, ip_len); +#endif + /* TODO: for else, discard and send a RST */ + /* if the ACK bit is off, respond with seq 0: + + else (ACK bit is on): + + */ + if (tcph->ack) { + SendTCPPacketStandalone(mtcp, iph->daddr, tcph->dest, iph->saddr, tcph->source, ack_seq, 0, 0, TCP_FLAG_RST, + NULL, 0, cur_ts, 0); + } else { + SendTCPPacketStandalone(mtcp, iph->daddr, tcph->dest, iph->saddr, tcph->source, 0, seq + payloadlen, 0, + TCP_FLAG_RST | TCP_FLAG_ACK, NULL, 0, cur_ts, 0); + } + return NULL; + } +} +/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_LISTEN(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, struct tcphdr *tcph) +{ + if (tcph->syn) { + if (cur_stream->state == TCP_ST_LISTEN) + cur_stream->rcv_nxt++; + cur_stream->state = TCP_ST_SYN_RCVD; + TRACE_STATE("Stream %d: TCP_ST_SYN_RCVD\n", cur_stream->id); + AddtoControlList(mtcp, cur_stream, cur_ts); + } else { + CTRACE_ERROR("Stream %d (TCP_ST_LISTEN): " + "Packet without SYN.\n", + cur_stream->id); + } +} +/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_SYN_SENT(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, const struct iphdr *iph, + struct tcphdr *tcph, uint32_t seq, uint32_t ack_seq, int payloadlen, uint16_t window) +{ + /* when active open */ + if (tcph->ack) { + /* filter the unacceptable acks */ + if 
(TCP_SEQ_LEQ(ack_seq, cur_stream->sndvar->iss) || TCP_SEQ_GT(ack_seq, cur_stream->snd_nxt)) { + if (!tcph->rst) { + SendTCPPacketStandalone(mtcp, iph->daddr, tcph->dest, iph->saddr, tcph->source, ack_seq, 0, 0, + TCP_FLAG_RST, NULL, 0, cur_ts, 0); + } + return; + } + /* accept the ack */ + cur_stream->sndvar->snd_una++; + } + + if (tcph->rst) { + if (tcph->ack) { + cur_stream->state = TCP_ST_CLOSE_WAIT; + cur_stream->close_reason = TCP_RESET; + if (cur_stream->socket) { + RaiseErrorEvent(mtcp, cur_stream); + } else { + DestroyTCPStream(mtcp, cur_stream); + } + } + return; + } + + if (tcph->syn) { + if (tcph->ack) { + int ret = HandleActiveOpen(mtcp, cur_stream, cur_ts, tcph, seq, ack_seq, window); + if (!ret) { + return; + } + + cur_stream->sndvar->nrtx = 0; + cur_stream->rcv_nxt = cur_stream->rcvvar->irs + 1; + RemoveFromRTOList(mtcp, cur_stream); + cur_stream->state = TCP_ST_ESTABLISHED; + TRACE_STATE("Stream %d: TCP_ST_ESTABLISHED\n", cur_stream->id); + + if (cur_stream->socket) { + RaiseWriteEvent(mtcp, cur_stream); + } else { + TRACE_STATE("Stream %d: ESTABLISHED, but no socket\n", cur_stream->id); + SendTCPPacketStandalone(mtcp, iph->daddr, tcph->dest, iph->saddr, tcph->source, 0, + seq + payloadlen + 1, 0, TCP_FLAG_RST | TCP_FLAG_ACK, NULL, 0, cur_ts, 0); + cur_stream->close_reason = TCP_ACTIVE_CLOSE; + DestroyTCPStream(mtcp, cur_stream); + return; + } + AddtoControlList(mtcp, cur_stream, cur_ts); + if (CONFIG.tcp_timeout > 0) + AddtoTimeoutList(mtcp, cur_stream); + + } else { + cur_stream->state = TCP_ST_SYN_RCVD; + TRACE_STATE("Stream %d: TCP_ST_SYN_RCVD\n", cur_stream->id); + cur_stream->snd_nxt = cur_stream->sndvar->iss; + AddtoControlList(mtcp, cur_stream, cur_ts); + } + } +} +/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_SYN_RCVD(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, struct tcphdr *tcph, + uint32_t ack_seq) +{ + struct tcp_send_vars *sndvar = 
cur_stream->sndvar; + int ret; + if (tcph->ack) { + struct tcp_listener *listener; + uint32_t prior_cwnd; + /* check if ACK of SYN */ + if (ack_seq != sndvar->iss + 1) { + CTRACE_ERROR("Stream %d (TCP_ST_SYN_RCVD): " + "weird ack_seq: %u, iss: %u\n", + cur_stream->id, ack_seq, sndvar->iss); + TRACE_DBG("Stream %d (TCP_ST_SYN_RCVD): " + "weird ack_seq: %u, iss: %u\n", + cur_stream->id, ack_seq, sndvar->iss); + return; + } + + sndvar->snd_una++; + cur_stream->snd_nxt = ack_seq; + prior_cwnd = sndvar->cwnd; + sndvar->cwnd = ((prior_cwnd == 1) ? (sndvar->mss * TCP_INIT_CWND) : sndvar->mss); + TRACE_DBG("sync_recvd: updating cwnd from %u to %u\n", prior_cwnd, sndvar->cwnd); + + //UpdateRetransmissionTimer(mtcp, cur_stream, cur_ts); + sndvar->nrtx = 0; + cur_stream->rcv_nxt = cur_stream->rcvvar->irs + 1; + RemoveFromRTOList(mtcp, cur_stream); + + cur_stream->state = TCP_ST_ESTABLISHED; + TRACE_STATE("Stream %d: TCP_ST_ESTABLISHED\n", cur_stream->id); + + /* update listening socket */ + listener = (struct tcp_listener *)ListenerHTSearch(mtcp->listeners, &tcph->dest); + + ret = StreamEnqueue(listener->acceptq, cur_stream); + if (ret < 0) { + TRACE_ERROR("Stream %d: Failed to enqueue to " + "the listen backlog!\n", + cur_stream->id); + cur_stream->close_reason = TCP_NOT_ACCEPTED; + cur_stream->state = TCP_ST_CLOSED; + TRACE_STATE("Stream %d: TCP_ST_CLOSED\n", cur_stream->id); + AddtoControlList(mtcp, cur_stream, cur_ts); + } + //TRACE_DBG("Stream %d inserted into acceptq.\n", cur_stream->id); + if (CONFIG.tcp_timeout > 0) + AddtoTimeoutList(mtcp, cur_stream); + + /* raise an event to the listening socket */ + if (listener->socket && (listener->socket->epoll & MTCP_EPOLLIN)) { + AddEpollEvent(mtcp->ep, MTCP_EVENT_QUEUE, listener->socket, MTCP_EPOLLIN); + } + + } else { + TRACE_DBG("Stream %d (TCP_ST_SYN_RCVD): No ACK.\n", cur_stream->id); + /* retransmit SYN/ACK */ + cur_stream->snd_nxt = sndvar->iss; + AddtoControlList(mtcp, cur_stream, cur_ts); + } +} 
+/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_ESTABLISHED(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, struct tcphdr *tcph, + uint32_t seq, uint32_t ack_seq, uint8_t *payload, int payloadlen, uint16_t window) +{ + if (tcph->syn) { + TRACE_DBG("Stream %d (TCP_ST_ESTABLISHED): weird SYN. " + "seq: %u, expected: %u, ack_seq: %u, expected: %u\n", + cur_stream->id, seq, cur_stream->rcv_nxt, ack_seq, cur_stream->snd_nxt); + cur_stream->snd_nxt = ack_seq; + AddtoControlList(mtcp, cur_stream, cur_ts); + return; + } + + if (payloadlen > 0) { + if (ProcessTCPPayload(mtcp, cur_stream, cur_ts, payload, seq, payloadlen)) { + /* if return is TRUE, send ACK */ + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_AGGREGATE); + } else { + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_NOW); + } + } + + if (tcph->ack) { + if (cur_stream->sndvar->sndbuf) { + ProcessACK(mtcp, cur_stream, cur_ts, tcph, seq, ack_seq, window, payloadlen); + } + } + + if (tcph->fin) { + /* process the FIN only if the sequence is valid */ + /* FIN packet is allowed to push payload (should we check for PSH flag)? 
*/ + if (seq + payloadlen == cur_stream->rcv_nxt) { + cur_stream->state = TCP_ST_CLOSE_WAIT; + TRACE_STATE("Stream %d: TCP_ST_CLOSE_WAIT\n", cur_stream->id); + cur_stream->rcv_nxt++; + AddtoControlList(mtcp, cur_stream, cur_ts); + + /* notify FIN to application */ + RaiseReadEvent(mtcp, cur_stream); + } else { + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_NOW); + return; + } + } +} +/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_CLOSE_WAIT(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, struct tcphdr *tcph, + uint32_t seq, uint32_t ack_seq, int payloadlen, uint16_t window) +{ + if (TCP_SEQ_LT(seq, cur_stream->rcv_nxt)) { + TRACE_DBG("Stream %d (TCP_ST_CLOSE_WAIT): " + "weird seq: %u, expected: %u\n", + cur_stream->id, seq, cur_stream->rcv_nxt); + AddtoControlList(mtcp, cur_stream, cur_ts); + return; + } + + if (cur_stream->sndvar->sndbuf) { + ProcessACK(mtcp, cur_stream, cur_ts, tcph, seq, ack_seq, window, payloadlen); + } +} +/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_LAST_ACK(mtcp_manager_t mtcp, uint32_t cur_ts, const struct iphdr *iph, int ip_len, + tcp_stream *cur_stream, struct tcphdr *tcph, uint32_t seq, uint32_t ack_seq, int payloadlen, + uint16_t window) +{ + (void)iph; + (void)ip_len; + if (TCP_SEQ_LT(seq, cur_stream->rcv_nxt)) { + TRACE_DBG("Stream %d (TCP_ST_LAST_ACK): " + "weird seq: %u, expected: %u\n", + cur_stream->id, seq, cur_stream->rcv_nxt); + return; + } + + if (tcph->ack) { + if (cur_stream->sndvar->sndbuf) { + ProcessACK(mtcp, cur_stream, cur_ts, tcph, seq, ack_seq, window, payloadlen); + } + + if (!cur_stream->sndvar->is_fin_sent) { + /* the case that FIN is not sent yet */ + /* this is not ack for FIN, ignore */ + TRACE_DBG("Stream %d (TCP_ST_LAST_ACK): " + "No FIN sent yet.\n", + cur_stream->id); +#ifdef DBGMSG + DumpIPPacket(mtcp, iph, ip_len); +#endif +#if defined(DUMP_STREAM) + 
DumpStream(mtcp, cur_stream); + DumpControlList(mtcp, mtcp->n_sender[0]); +#endif + return; + } + + /* check if ACK of FIN */ + if (ack_seq == cur_stream->sndvar->fss + 1) { + cur_stream->sndvar->snd_una++; + UpdateRetransmissionTimer(mtcp, cur_stream, cur_ts); + cur_stream->state = TCP_ST_CLOSED; + cur_stream->close_reason = TCP_PASSIVE_CLOSE; + TRACE_STATE("Stream %d: TCP_ST_CLOSED\n", cur_stream->id); + DestroyTCPStream(mtcp, cur_stream); + } else { + TRACE_DBG("Stream %d (TCP_ST_LAST_ACK): Not ACK of FIN. " + "ack_seq: %u, expected: %u\n", + cur_stream->id, ack_seq, cur_stream->sndvar->fss + 1); + //cur_stream->snd_nxt = cur_stream->sndvar->fss; + AddtoControlList(mtcp, cur_stream, cur_ts); + } + } else { + CTRACE_ERROR("Stream %d (TCP_ST_LAST_ACK): No ACK\n", cur_stream->id); + //cur_stream->snd_nxt = cur_stream->sndvar->fss; + AddtoControlList(mtcp, cur_stream, cur_ts); + } +} +/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_FIN_WAIT_1(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, struct tcphdr *tcph, + uint32_t seq, uint32_t ack_seq, uint8_t *payload, int payloadlen, uint16_t window) +{ + if (TCP_SEQ_LT(seq, cur_stream->rcv_nxt)) { + TRACE_DBG("Stream %d (TCP_ST_LAST_ACK): " + "weird seq: %u, expected: %u\n", + cur_stream->id, seq, cur_stream->rcv_nxt); + AddtoControlList(mtcp, cur_stream, cur_ts); + return; + } + + if (tcph->ack) { + if (cur_stream->sndvar->sndbuf) { + ProcessACK(mtcp, cur_stream, cur_ts, tcph, seq, ack_seq, window, payloadlen); + } + + if (cur_stream->sndvar->is_fin_sent && ack_seq == cur_stream->sndvar->fss + 1) { + cur_stream->sndvar->snd_una = ack_seq; + if (TCP_SEQ_GT(ack_seq, cur_stream->snd_nxt)) { + TRACE_DBG("Stream %d: update snd_nxt to %u\n", cur_stream->id, ack_seq); + cur_stream->snd_nxt = ack_seq; + } + //cur_stream->sndvar->snd_una++; + //UpdateRetransmissionTimer(mtcp, cur_stream, cur_ts); + cur_stream->sndvar->nrtx = 0; + 
RemoveFromRTOList(mtcp, cur_stream); + cur_stream->state = TCP_ST_FIN_WAIT_2; + TRACE_STATE("Stream %d: TCP_ST_FIN_WAIT_2\n", cur_stream->id); + } + + } else { + TRACE_DBG("Stream %d: does not contain an ack!\n", cur_stream->id); + return; + } + + if (payloadlen > 0) { + if (ProcessTCPPayload(mtcp, cur_stream, cur_ts, payload, seq, payloadlen)) { + /* if return is TRUE, send ACK */ + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_AGGREGATE); + } else { + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_NOW); + } + } + + if (tcph->fin) { + /* process the FIN only if the sequence is valid */ + /* FIN packet is allowed to push payload (should we check for PSH flag)? */ + if (seq + payloadlen == cur_stream->rcv_nxt) { + cur_stream->rcv_nxt++; + + if (cur_stream->state == TCP_ST_FIN_WAIT_1) { + cur_stream->state = TCP_ST_CLOSING; + TRACE_STATE("Stream %d: TCP_ST_CLOSING\n", cur_stream->id); + + } else if (cur_stream->state == TCP_ST_FIN_WAIT_2) { + cur_stream->state = TCP_ST_TIME_WAIT; + TRACE_STATE("Stream %d: TCP_ST_TIME_WAIT\n", cur_stream->id); + AddtoTimewaitList(mtcp, cur_stream, cur_ts); + } + AddtoControlList(mtcp, cur_stream, cur_ts); + } + } +} +/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_FIN_WAIT_2(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, struct tcphdr *tcph, + uint32_t seq, uint32_t ack_seq, uint8_t *payload, int payloadlen, uint16_t window) +{ + if (tcph->ack) { + if (cur_stream->sndvar->sndbuf) { + ProcessACK(mtcp, cur_stream, cur_ts, tcph, seq, ack_seq, window, payloadlen); + } + } else { + TRACE_DBG("Stream %d: does not contain an ack!\n", cur_stream->id); + return; + } + + if (payloadlen > 0) { + if (ProcessTCPPayload(mtcp, cur_stream, cur_ts, payload, seq, payloadlen)) { + /* if return is TRUE, send ACK */ + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_AGGREGATE); + } else { + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_NOW); + } + } + + if (tcph->fin) { + /* process 
the FIN only if the sequence is valid */ + /* FIN packet is allowed to push payload (should we check for PSH flag)? */ + if (seq + payloadlen == cur_stream->rcv_nxt) { + cur_stream->state = TCP_ST_TIME_WAIT; + cur_stream->rcv_nxt++; + TRACE_STATE("Stream %d: TCP_ST_TIME_WAIT\n", cur_stream->id); + + AddtoTimewaitList(mtcp, cur_stream, cur_ts); + AddtoControlList(mtcp, cur_stream, cur_ts); + } +#if 0 + } else { + TRACE_DBG("Stream %d (TCP_ST_FIN_WAIT_2): No FIN. " + "seq: %u, ack_seq: %u, snd_nxt: %u, snd_una: %u\n", + cur_stream->id, seq, ack_seq, + cur_stream->snd_nxt, cur_stream->sndvar->snd_una); +#if DBGMSG + DumpIPPacket(mtcp, iph, ip_len); +#endif +#endif + } +} +/*----------------------------------------------------------------------------*/ +static inline void Handle_TCP_ST_CLOSING(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream, struct tcphdr *tcph, + uint32_t seq, uint32_t ack_seq, int payloadlen, uint16_t window) +{ + if (tcph->ack) { + if (cur_stream->sndvar->sndbuf) { + ProcessACK(mtcp, cur_stream, cur_ts, tcph, seq, ack_seq, window, payloadlen); + } + + if (!cur_stream->sndvar->is_fin_sent) { + TRACE_DBG("Stream %d (TCP_ST_CLOSING): " + "No FIN sent yet.\n", + cur_stream->id); + return; + } + + // check if ACK of FIN + if (ack_seq != cur_stream->sndvar->fss + 1) { +#if 0 + CTRACE_ERROR("Stream %d (TCP_ST_CLOSING): Not ACK of FIN. 
" + "ack_seq: %u, snd_nxt: %u, snd_una: %u, fss: %u\n", + cur_stream->id, ack_seq, cur_stream->snd_nxt, + cur_stream->sndvar->snd_una, cur_stream->sndvar->fss); + DumpIPPacketToFile(stderr, iph, ip_len); + DumpStream(mtcp, cur_stream); +#endif + //assert(0); + /* if the packet is not the ACK of FIN, ignore */ + return; + } + + cur_stream->sndvar->snd_una = ack_seq; + cur_stream->snd_nxt = ack_seq; + UpdateRetransmissionTimer(mtcp, cur_stream, cur_ts); + + cur_stream->state = TCP_ST_TIME_WAIT; + TRACE_STATE("Stream %d: TCP_ST_TIME_WAIT\n", cur_stream->id); + + AddtoTimewaitList(mtcp, cur_stream, cur_ts); + + } else { + CTRACE_ERROR("Stream %d (TCP_ST_CLOSING): Not ACK\n", cur_stream->id); + return; + } +} +/*----------------------------------------------------------------------------*/ +int ProcessTCPPacket(mtcp_manager_t mtcp, uint32_t cur_ts, const int ifidx, const struct iphdr *iph, int ip_len) +{ + struct tcphdr *tcph = (struct tcphdr *)(uintptr_t)((const u_char *)iph + (iph->ihl << 2)); + uint8_t *payload = (uint8_t *)tcph + (tcph->doff << 2); + int payloadlen = ip_len - (payload - (const u_char *)iph); + tcp_stream s_stream; + tcp_stream *cur_stream = NULL; + uint32_t seq = ntohl(tcph->seq); + uint32_t ack_seq = ntohl(tcph->ack_seq); + uint16_t window = ntohs(tcph->window); + uint16_t check; + int ret; + int rc = -1; + + /* Check ip packet invalidation */ + if (ip_len < ((iph->ihl + tcph->doff) << 2)) + return ERROR; + +#if VERIFY_RX_CHECKSUM +#ifndef DISABLE_HWCSUM + if (mtcp->iom->dev_ioctl != NULL) + rc = mtcp->iom->dev_ioctl(mtcp->ctx, ifidx, PKT_RX_TCP_CSUM, NULL); +#endif + if (rc == -1) { + check = TCPCalcChecksum((uint16_t *)tcph, (tcph->doff << 2) + payloadlen, iph->saddr, iph->daddr); + if (check) { + TRACE_DBG("Checksum Error: Original: 0x%04x, calculated: 0x%04x\n", tcph->check, + TCPCalcChecksum((uint16_t *)tcph, (tcph->doff << 2) + payloadlen, iph->saddr, iph->daddr)); + tcph->check = 0; + return ERROR; + } + } +#endif + +#if defined(NETSTAT) && 
defined(ENABLELRO) + mtcp->nstat.rx_gdptbytes += payloadlen; +#endif /* NETSTAT */ + + s_stream.saddr = iph->daddr; + s_stream.sport = tcph->dest; + s_stream.daddr = iph->saddr; + s_stream.dport = tcph->source; + + if (!(cur_stream = StreamHTSearch(mtcp->tcp_flow_table, &s_stream))) { + /* not found in flow table */ + cur_stream = CreateNewFlowHTEntry(mtcp, cur_ts, iph, ip_len, tcph, seq, ack_seq, payloadlen, window); + if (!cur_stream) + return TRUE; + } + + /* Validate sequence. if not valid, ignore the packet */ + if (cur_stream->state > TCP_ST_SYN_RCVD) { + ret = ValidateSequence(mtcp, cur_stream, cur_ts, tcph, seq, ack_seq, payloadlen); + if (!ret) { + TRACE_DBG("Stream %d: Unexpected sequence: %u, expected: %u\n", cur_stream->id, seq, cur_stream->rcv_nxt); +#ifdef DBGMSG + DumpIPPacket(mtcp, iph, ip_len); +#endif +#ifdef DUMP_STREAM + DumpStream(mtcp, cur_stream); +#endif + return TRUE; + } + } + + /* Update receive window size */ + if (tcph->syn) { + cur_stream->sndvar->peer_wnd = window; + } else { + cur_stream->sndvar->peer_wnd = (uint32_t)window << cur_stream->sndvar->wscale_peer; + } + + cur_stream->last_active_ts = cur_ts; + UpdateTimeoutList(mtcp, cur_stream); + + /* Process RST: process here only if state > TCP_ST_SYN_SENT */ + if (tcph->rst) { + cur_stream->have_reset = TRUE; + if (cur_stream->state > TCP_ST_SYN_SENT) { + if (ProcessRST(mtcp, cur_stream, ack_seq)) { + return TRUE; + } + } + } + + switch (cur_stream->state) { + case TCP_ST_LISTEN: + Handle_TCP_ST_LISTEN(mtcp, cur_ts, cur_stream, tcph); + break; + + case TCP_ST_SYN_SENT: + Handle_TCP_ST_SYN_SENT(mtcp, cur_ts, cur_stream, iph, tcph, seq, ack_seq, payloadlen, window); + break; + + case TCP_ST_SYN_RCVD: + /* SYN retransmit implies our SYN/ACK was lost. 
Resend */ + if (tcph->syn && seq == cur_stream->rcvvar->irs) + Handle_TCP_ST_LISTEN(mtcp, cur_ts, cur_stream, tcph); + else { + Handle_TCP_ST_SYN_RCVD(mtcp, cur_ts, cur_stream, tcph, ack_seq); + if (payloadlen > 0 && cur_stream->state == TCP_ST_ESTABLISHED) { + Handle_TCP_ST_ESTABLISHED(mtcp, cur_ts, cur_stream, tcph, seq, ack_seq, payload, payloadlen, window); + } + } + break; + + case TCP_ST_ESTABLISHED: + Handle_TCP_ST_ESTABLISHED(mtcp, cur_ts, cur_stream, tcph, seq, ack_seq, payload, payloadlen, window); + break; + + case TCP_ST_CLOSE_WAIT: + Handle_TCP_ST_CLOSE_WAIT(mtcp, cur_ts, cur_stream, tcph, seq, ack_seq, payloadlen, window); + break; + + case TCP_ST_LAST_ACK: + Handle_TCP_ST_LAST_ACK(mtcp, cur_ts, iph, ip_len, cur_stream, tcph, seq, ack_seq, payloadlen, window); + break; + + case TCP_ST_FIN_WAIT_1: + Handle_TCP_ST_FIN_WAIT_1(mtcp, cur_ts, cur_stream, tcph, seq, ack_seq, payload, payloadlen, window); + break; + + case TCP_ST_FIN_WAIT_2: + Handle_TCP_ST_FIN_WAIT_2(mtcp, cur_ts, cur_stream, tcph, seq, ack_seq, payload, payloadlen, window); + break; + + case TCP_ST_CLOSING: + Handle_TCP_ST_CLOSING(mtcp, cur_ts, cur_stream, tcph, seq, ack_seq, payloadlen, window); + break; + + case TCP_ST_TIME_WAIT: + /* the only thing that can arrive in this state is a retransmission + of the remote FIN. Acknowledge it, and restart the 2 MSL timeout */ + if (cur_stream->on_timewait_list) { + RemoveFromTimewaitList(mtcp, cur_stream); + AddtoTimewaitList(mtcp, cur_stream, cur_ts); + } + AddtoControlList(mtcp, cur_stream, cur_ts); + break; + + case TCP_ST_CLOSED: + break; + } + + return TRUE; +} diff --git a/lib/flash/mtcp/tcp_out.c b/lib/flash/mtcp/tcp_out.c new file mode 100644 index 0000000..a4bba01 --- /dev/null +++ b/lib/flash/mtcp/tcp_out.c @@ -0,0 +1,1072 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. 
+ * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + #include +#include "tcp_out.h" +#include "tcp_util.h" +#include "mtcp.h" +#include "ip_out.h" +#include "tcp_in.h" +#include "tcp_stream.h" +#include "eventpoll.h" +#include "timer.h" +#include "debug.h" +#if RATE_LIMIT_ENABLED || PACING_ENABLED +#include "pacing.h" +#endif + +#define TCP_CALCULATE_CHECKSUM TRUE +#define ACK_PIGGYBACK TRUE +#define TRY_SEND_BEFORE_QUEUE FALSE + +#define TCP_MAX_WINDOW 65535 + +/*----------------------------------------------------------------------------*/ +static inline uint16_t CalculateOptionLength(uint8_t flags) +{ + uint16_t optlen = 0; + + if (flags & TCP_FLAG_SYN) { + optlen += TCP_OPT_MSS_LEN; +#if TCP_OPT_SACK_ENABLED + optlen += TCP_OPT_SACK_PERMIT_LEN; +#if !TCP_OPT_TIMESTAMP_ENABLED + optlen += 2; // insert NOP padding +#endif /* TCP_OPT_TIMESTAMP_ENABLED */ +#endif /* TCP_OPT_SACK_ENABLED */ + +#if TCP_OPT_TIMESTAMP_ENABLED + optlen += TCP_OPT_TIMESTAMP_LEN; +#if !TCP_OPT_SACK_ENABLED + optlen += 2; // insert NOP padding +#endif /* TCP_OPT_SACK_ENABLED */ +#endif /* TCP_OPT_TIMESTAMP_ENABLED */ + + optlen += TCP_OPT_WSCALE_LEN + 1; + + } else { +#if TCP_OPT_TIMESTAMP_ENABLED + optlen += TCP_OPT_TIMESTAMP_LEN + 2; +#endif + +#if TCP_OPT_SACK_ENABLED + if (flags & TCP_FLAG_SACK) { + optlen += TCP_OPT_SACK_LEN + 2; + } +#endif + } + + assert(optlen % 4 == 0); + + return optlen; +} +/*----------------------------------------------------------------------------*/ +static inline void GenerateTCPTimestamp(tcp_stream *cur_stream, uint8_t *tcpopt, uint32_t cur_ts) +{ + uint32_t *ts = (uint32_t *)(tcpopt + 2); + + tcpopt[0] = TCP_OPT_TIMESTAMP; + tcpopt[1] = TCP_OPT_TIMESTAMP_LEN; + ts[0] = htonl(cur_ts); + ts[1] = htonl(cur_stream->rcvvar->ts_recent); +} +/*----------------------------------------------------------------------------*/ +static inline void GenerateTCPOptions(tcp_stream *cur_stream, uint32_t cur_ts, uint8_t flags, uint8_t *tcpopt, uint16_t optlen) +{ + (void)optlen; + int i = 0; + + if (flags & 
TCP_FLAG_SYN) { + uint16_t mss; + + /* MSS option */ + mss = cur_stream->sndvar->mss; + tcpopt[i++] = TCP_OPT_MSS; + tcpopt[i++] = TCP_OPT_MSS_LEN; + tcpopt[i++] = mss >> 8; + tcpopt[i++] = mss % 256; + + /* SACK permit */ +#if TCP_OPT_SACK_ENABLED +#if !TCP_OPT_TIMESTAMP_ENABLED + tcpopt[i++] = TCP_OPT_NOP; + tcpopt[i++] = TCP_OPT_NOP; +#endif /* TCP_OPT_TIMESTAMP_ENABLED */ + tcpopt[i++] = TCP_OPT_SACK_PERMIT; + tcpopt[i++] = TCP_OPT_SACK_PERMIT_LEN; + TRACE_SACK("Local SACK permited.\n"); +#endif /* TCP_OPT_SACK_ENABLED */ + + /* Timestamp */ +#if TCP_OPT_TIMESTAMP_ENABLED +#if !TCP_OPT_SACK_ENABLED + tcpopt[i++] = TCP_OPT_NOP; + tcpopt[i++] = TCP_OPT_NOP; +#endif /* TCP_OPT_SACK_ENABLED */ + GenerateTCPTimestamp(cur_stream, tcpopt + i, cur_ts); + i += TCP_OPT_TIMESTAMP_LEN; +#endif /* TCP_OPT_TIMESTAMP_ENABLED */ + + /* Window scale */ + tcpopt[i++] = TCP_OPT_NOP; + tcpopt[i++] = TCP_OPT_WSCALE; + tcpopt[i++] = TCP_OPT_WSCALE_LEN; + tcpopt[i++] = cur_stream->sndvar->wscale_mine; + + } else { +#if TCP_OPT_TIMESTAMP_ENABLED + tcpopt[i++] = TCP_OPT_NOP; + tcpopt[i++] = TCP_OPT_NOP; + GenerateTCPTimestamp(cur_stream, tcpopt + i, cur_ts); + i += TCP_OPT_TIMESTAMP_LEN; +#endif + +#if TCP_OPT_SACK_ENABLED + if (flags & TCP_OPT_SACK) { + // i += GenerateSACKOption(cur_stream, tcpopt + i); + } +#endif + } + + assert(i == optlen); +} +/*----------------------------------------------------------------------------*/ +int SendTCPPacketStandalone(struct mtcp_manager *mtcp, uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport, uint32_t seq, + uint32_t ack_seq, uint16_t window, uint8_t flags, uint8_t *payload, uint16_t payloadlen, uint32_t cur_ts, + uint32_t echo_ts) +{ + struct tcphdr *tcph; + uint8_t *tcpopt; + uint32_t *ts; + uint16_t optlen; + int rc = -1; + + optlen = CalculateOptionLength(flags); + if (payloadlen + optlen > TCP_DEFAULT_MSS) { + TRACE_ERROR("Payload size exceeds MSS.\n"); + assert(0); + return ERROR; + } + + tcph = (struct tcphdr 
*)IPOutputStandalone(mtcp, IPPROTO_TCP, 0, saddr, daddr, TCP_HEADER_LEN + optlen + payloadlen); + if (tcph == NULL) { + return ERROR; + } + memset(tcph, 0, TCP_HEADER_LEN + optlen); + + tcph->source = sport; + tcph->dest = dport; + + if (flags & TCP_FLAG_SYN) + tcph->syn = TRUE; + if (flags & TCP_FLAG_FIN) + tcph->fin = TRUE; + if (flags & TCP_FLAG_RST) + tcph->rst = TRUE; + if (flags & TCP_FLAG_PSH) + tcph->psh = TRUE; + + tcph->seq = htonl(seq); + if (flags & TCP_FLAG_ACK) { + tcph->ack = TRUE; + tcph->ack_seq = htonl(ack_seq); + } + + tcph->window = htons(MIN(window, TCP_MAX_WINDOW)); + + tcpopt = (uint8_t *)tcph + TCP_HEADER_LEN; + ts = (uint32_t *)(tcpopt + 4); + + tcpopt[0] = TCP_OPT_NOP; + tcpopt[1] = TCP_OPT_NOP; + tcpopt[2] = TCP_OPT_TIMESTAMP; + tcpopt[3] = TCP_OPT_TIMESTAMP_LEN; + ts[0] = htonl(cur_ts); + ts[1] = htonl(echo_ts); + + tcph->doff = (TCP_HEADER_LEN + optlen) >> 2; + // copy payload if exist + if (payloadlen > 0) { + memcpy((uint8_t *)tcph + TCP_HEADER_LEN + optlen, payload, payloadlen); +#if defined(NETSTAT) && defined(ENABLELRO) + mtcp->nstat.tx_gdptbytes += payloadlen; +#endif /* NETSTAT */ + } + +#if TCP_CALCULATE_CHECKSUM +#ifndef DISABLE_HWCSUM + uint8_t is_external; + if (mtcp->iom->dev_ioctl != NULL) + rc = mtcp->iom->dev_ioctl(mtcp->ctx, GetOutputInterface(daddr, &is_external), PKT_TX_TCPIP_CSUM, NULL); + UNUSED(is_external); +#endif + if (rc == -1) + tcph->check = TCPCalcChecksum((uint16_t *)tcph, TCP_HEADER_LEN + optlen + payloadlen, saddr, daddr); +#endif + + if (tcph->syn || tcph->fin) { + payloadlen++; + } + + return payloadlen; +} +/*----------------------------------------------------------------------------*/ +int SendTCPPacket(struct mtcp_manager *mtcp, tcp_stream *cur_stream, uint32_t cur_ts, uint8_t flags, uint8_t *payload, + uint16_t payloadlen) +{ + struct tcphdr *tcph; + uint16_t optlen; + uint8_t wscale = 0; + uint32_t window32 = 0; + int rc = -1; + + optlen = CalculateOptionLength(flags); + if (payloadlen + optlen > 
cur_stream->sndvar->mss) { + TRACE_ERROR("Payload size exceeds MSS\n"); + return ERROR; + } + + tcph = (struct tcphdr *)IPOutput(mtcp, cur_stream, TCP_HEADER_LEN + optlen + payloadlen); + if (tcph == NULL) { + return -2; + } + memset(tcph, 0, TCP_HEADER_LEN + optlen); + + tcph->source = cur_stream->sport; + tcph->dest = cur_stream->dport; + + if (flags & TCP_FLAG_SYN) { + tcph->syn = TRUE; + if (cur_stream->snd_nxt != cur_stream->sndvar->iss) { + TRACE_DBG("Stream %d: weird SYN sequence. " + "snd_nxt: %u, iss: %u\n", + cur_stream->id, cur_stream->snd_nxt, cur_stream->sndvar->iss); + } +#if 0 + TRACE_FIN("Stream %d: Sending SYN. seq: %u, ack_seq: %u\n", + cur_stream->id, cur_stream->snd_nxt, cur_stream->rcv_nxt); +#endif + } + if (flags & TCP_FLAG_RST) { + TRACE_FIN("Stream %d: Sending RST.\n", cur_stream->id); + tcph->rst = TRUE; + } + if (flags & TCP_FLAG_PSH) + tcph->psh = TRUE; + + if (flags & TCP_FLAG_WACK) { + tcph->seq = htonl(cur_stream->snd_nxt - 1); + TRACE_CLWND("%u Sending ACK to get new window advertisement. " + "seq: %u, peer_wnd: %u, snd_nxt - snd_una: %u\n", + cur_stream->id, cur_stream->snd_nxt - 1, cur_stream->sndvar->peer_wnd, + cur_stream->snd_nxt - cur_stream->sndvar->snd_una); + } else if (flags & TCP_FLAG_FIN) { + tcph->fin = TRUE; + + if (cur_stream->sndvar->fss == 0) { + TRACE_ERROR("Stream %u: not fss set. closed: %u\n", cur_stream->id, cur_stream->closed); + } + tcph->seq = htonl(cur_stream->sndvar->fss); + cur_stream->sndvar->is_fin_sent = TRUE; + TRACE_FIN("Stream %d: Sending FIN. 
seq: %u, ack_seq: %u\n", cur_stream->id, cur_stream->snd_nxt, cur_stream->rcv_nxt); + } else { + tcph->seq = htonl(cur_stream->snd_nxt); + } + + if (flags & TCP_FLAG_ACK) { + tcph->ack = TRUE; + tcph->ack_seq = htonl(cur_stream->rcv_nxt); + cur_stream->sndvar->ts_lastack_sent = cur_ts; + cur_stream->last_active_ts = cur_ts; + UpdateTimeoutList(mtcp, cur_stream); + } + + if (flags & TCP_FLAG_SYN) { + wscale = 0; + } else { + wscale = cur_stream->sndvar->wscale_mine; + } + + window32 = cur_stream->rcvvar->rcv_wnd >> wscale; + tcph->window = htons((uint16_t)MIN(window32, TCP_MAX_WINDOW)); + /* if the advertised window is 0, we need to advertise again later */ + if (window32 == 0) { + cur_stream->need_wnd_adv = TRUE; + } + + GenerateTCPOptions(cur_stream, cur_ts, flags, (uint8_t *)tcph + TCP_HEADER_LEN, optlen); + + tcph->doff = (TCP_HEADER_LEN + optlen) >> 2; + // copy payload if exist + if (payloadlen > 0) { + memcpy((uint8_t *)tcph + TCP_HEADER_LEN + optlen, payload, payloadlen); +#if defined(NETSTAT) && defined(ENABLELRO) + mtcp->nstat.tx_gdptbytes += payloadlen; +#endif /* NETSTAT */ + } + +#if TCP_CALCULATE_CHECKSUM +#ifndef DISABLE_HWCSUM + if (mtcp->iom->dev_ioctl != NULL) + rc = mtcp->iom->dev_ioctl(mtcp->ctx, cur_stream->sndvar->nif_out, PKT_TX_TCPIP_CSUM, NULL); +#endif + if (rc == -1) + tcph->check = + TCPCalcChecksum((uint16_t *)tcph, TCP_HEADER_LEN + optlen + payloadlen, cur_stream->saddr, cur_stream->daddr); +#endif + + cur_stream->snd_nxt += payloadlen; + + if (tcph->syn || tcph->fin) { + cur_stream->snd_nxt++; + payloadlen++; + } + + if (payloadlen > 0) { + if (cur_stream->state > TCP_ST_ESTABLISHED) { + TRACE_FIN("Payload after ESTABLISHED: length: %d, snd_nxt: %u\n", payloadlen, cur_stream->snd_nxt); + } + + /* update retransmission timer if have payload */ + cur_stream->sndvar->ts_rto = cur_ts + cur_stream->sndvar->rto; + TRACE_RTO("Updating retransmission timer. 
" + "cur_ts: %u, rto: %u, ts_rto: %u\n", + cur_ts, cur_stream->sndvar->rto, cur_stream->sndvar->ts_rto); + AddtoRTOList(mtcp, cur_stream); + } + + return payloadlen; +} +/*----------------------------------------------------------------------------*/ +static int FlushTCPSendingBuffer(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts) +{ +#if 0 + struct tcp_send_vars *sndvar = cur_stream->sndvar; + const uint32_t maxlen = sndvar->mss - CalculateOptionLength(TCP_FLAG_ACK); + uint8_t *data; + uint32_t buffered_len; + uint32_t seq; + uint16_t len; + int16_t sndlen; + uint32_t window; + int packets = 0; + uint8_t wack_sent = 0; + + if (!sndvar->sndbuf) { + TRACE_ERROR("Stream %d: No send buffer available.\n", cur_stream->id); + assert(0); + return 0; + } + + SBUF_LOCK(&sndvar->write_lock); + + if (sndvar->sndbuf->len == 0) { + packets = 0; + goto out; + } + + window = MIN(sndvar->cwnd, sndvar->peer_wnd); + + while (1) { + seq = cur_stream->snd_nxt; + + if (TCP_SEQ_LT(seq, sndvar->sndbuf->head_seq)) { + TRACE_ERROR("Stream %d: Invalid sequence to send. 
" + "state: %s, seq: %u, head_seq: %u.\n", + cur_stream->id, TCPStateToString(cur_stream), + seq, sndvar->sndbuf->head_seq); + assert(0); + break; + } + buffered_len = sndvar->sndbuf->head_seq + sndvar->sndbuf->len - seq; + if (cur_stream->state > TCP_ST_ESTABLISHED) { + TRACE_FIN("head_seq: %u, len: %u, seq: %u, " + "buffered_len: %u\n", sndvar->sndbuf->head_seq, + sndvar->sndbuf->len, seq, buffered_len); + } + if (buffered_len == 0) + break; + + data = sndvar->sndbuf->head + + (seq - sndvar->sndbuf->head_seq); + + if (buffered_len > maxlen) { + len = maxlen; + } else { + len = buffered_len; + } + + if (len > window) + len = window; + + if (len <= 0) + break; + + if (cur_stream->state > TCP_ST_ESTABLISHED) { + TRACE_FIN("Flushing after ESTABLISHED: seq: %u, len: %u, " + "buffered_len: %u\n", seq, len, buffered_len); + } + + if (seq - sndvar->snd_una + len > window) { + /* Ask for new window advertisement to peer */ + if (seq - sndvar->snd_una + len > sndvar->peer_wnd) { +#if 0 + TRACE_CLWND("Full peer window. 
" + "peer_wnd: %u, (snd_nxt-snd_una): %u\n", + sndvar->peer_wnd, seq - sndvar->snd_una); +#endif + if (!wack_sent && TS_TO_MSEC(cur_ts - sndvar->ts_lastack_sent) > 500) { + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_WACK); + } + else + wack_sent = 1; + } + packets = -3; + goto out; + } + + sndlen = SendTCPPacket(mtcp, cur_stream, cur_ts, + TCP_FLAG_ACK, data, len); + if (sndlen < 0) { + packets = sndlen; + goto out; + } + packets++; + + window -= len; + } + + out: + SBUF_UNLOCK(&sndvar->write_lock); + return packets; +#else + struct tcp_send_vars *sndvar = cur_stream->sndvar; + uint8_t *data; + uint32_t pkt_len; + uint32_t len; + uint32_t seq = 0; + int remaining_window; + int sndlen; + int packets = 0; + uint8_t wack_sent = 0; + + if (!sndvar->sndbuf) { + TRACE_ERROR("Stream %d: No send buffer available.\n", cur_stream->id); + assert(0); + return 0; + } + + SBUF_LOCK(&sndvar->write_lock); + + if (sndvar->sndbuf->len == 0) { + packets = 0; + goto out; + } + + while (1) { +#if USE_CCP + if (sndvar->missing_seq) { + seq = sndvar->missing_seq; + } else { +#endif + seq = cur_stream->snd_nxt; +#if USE_CCP + } +#endif + //seq = cur_stream->snd_nxt; + data = sndvar->sndbuf->head + (seq - sndvar->sndbuf->head_seq); + len = sndvar->sndbuf->len - (seq - sndvar->sndbuf->head_seq); +#if USE_CCP + // Without this, mm continually drops packets (not sure why, bursting?) -> mtcp sees lots of losses -> throughput dies + if (cur_stream->wait_for_acks && TCP_SEQ_GT(cur_stream->snd_nxt, cur_stream->rcvvar->last_ack_seq)) { + goto out; + } +#endif + /* sanity check */ + if (TCP_SEQ_LT(seq, sndvar->sndbuf->head_seq)) { + TRACE_ERROR("Stream %d: Invalid sequence to send. " + "state: %s, seq: %u, head_seq: %u.\n", + cur_stream->id, TCPStateToString(cur_stream), seq, sndvar->sndbuf->head_seq); + assert(0); + break; + } + if (TCP_SEQ_LT(seq, sndvar->snd_una)) { + TRACE_ERROR("Stream %d: Invalid sequence to send. 
" + "state: %s, seq: %u, snd_una: %u.\n", + cur_stream->id, TCPStateToString(cur_stream), seq, sndvar->snd_una); + assert(0); + break; + } + if (sndvar->sndbuf->len < (seq - sndvar->sndbuf->head_seq)) { + TRACE_ERROR("Stream %d: len < 0\n", cur_stream->id); + assert(0); + break; + } + + /* if there is no buffered data */ + if (len == 0) + break; + +#if TCP_OPT_SACK_ENABLED + if (SeqIsSacked(cur_stream, seq)) { + TRACE_DBG("!! SKIPPING %u\n", seq - sndvar->iss); + cur_stream->snd_nxt += len; + continue; + } +#endif + + remaining_window = MIN(sndvar->cwnd, sndvar->peer_wnd) - (seq - sndvar->snd_una); + /* if there is no space in the window */ + if (remaining_window <= 0 || (remaining_window < sndvar->mss && seq - sndvar->snd_una > 0)) { + /* if peer window is full, send ACK and let its peer advertises new one */ + if (sndvar->peer_wnd <= sndvar->cwnd) { +#if 0 + TRACE_CLWND("Full peer window. " + "peer_wnd: %u, (snd_nxt-snd_una): %u\n", + sndvar->peer_wnd, seq - sndvar->snd_una); +#endif + if (!wack_sent && TS_TO_MSEC(cur_ts - sndvar->ts_lastack_sent) > 500) + EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_WACK); + else + wack_sent = 1; + } + packets = -3; + goto out; + } + + /* payload size limited by remaining window space */ + len = MIN((int)len, remaining_window); + /* payload size limited by TCP MSS */ + pkt_len = MIN((int)len, (int)sndvar->mss - (int)CalculateOptionLength(TCP_FLAG_ACK)); + +#if RATE_LIMIT_ENABLED + // update rate + if (cur_stream->rcvvar->srtt) { + cur_stream->bucket->rate = (uint32_t)(SECONDS_TO_USECS( // bits / s = mbps + BYTES_TO_BITS( // bits / us + (double)sndvar->cwnd / UNSHIFT_SRTT(cur_stream->rcvvar->srtt) // bytes / us + ))); + } + if (cur_stream->bucket->rate != 0 && (SufficientTokens(cur_stream->bucket, pkt_len * 8) < 0)) { + packets = -3; + goto out; + } +#endif + +#if PACING_ENABLED + if (!CanSendNow(cur_stream->pacer)) { + packets = -3; + goto out; + } +#endif + if ((sndlen = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_ACK, 
data, pkt_len)) < 0) { + /* there is no available tx buf */ + packets = -3; + goto out; + } +#if USE_CCP + if (sndvar->missing_seq) { + sndvar->missing_seq = 0; + } +#endif + packets++; + } + +out: + SBUF_UNLOCK(&sndvar->write_lock); + return packets; +#endif +} +/*----------------------------------------------------------------------------*/ +static inline int SendControlPacket(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts) +{ + struct tcp_send_vars *sndvar = cur_stream->sndvar; + int ret = 0; + + if (cur_stream->state == TCP_ST_SYN_SENT) { + /* Send SYN here */ + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_SYN, NULL, 0); + + } else if (cur_stream->state == TCP_ST_SYN_RCVD) { + /* Send SYN/ACK here */ + cur_stream->snd_nxt = sndvar->iss; + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_SYN | TCP_FLAG_ACK, NULL, 0); + + } else if (cur_stream->state == TCP_ST_ESTABLISHED) { + /* Send ACK here */ + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_ACK, NULL, 0); + + } else if (cur_stream->state == TCP_ST_CLOSE_WAIT) { + /* Send ACK for the FIN here */ + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_ACK, NULL, 0); + + } else if (cur_stream->state == TCP_ST_LAST_ACK) { + /* if it is on ack_list, send it after sending ack */ + if (sndvar->on_send_list || sndvar->on_ack_list) { + ret = -1; + } else { + /* Send FIN/ACK here */ + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_FIN | TCP_FLAG_ACK, NULL, 0); + } + } else if (cur_stream->state == TCP_ST_FIN_WAIT_1) { + /* if it is on ack_list, send it after sending ack */ + if (sndvar->on_send_list || sndvar->on_ack_list) { + ret = -1; + } else { + /* Send FIN/ACK here */ + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_FIN | TCP_FLAG_ACK, NULL, 0); + } + + } else if (cur_stream->state == TCP_ST_FIN_WAIT_2) { + /* Send ACK here */ + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_ACK, NULL, 0); + + } else if (cur_stream->state == TCP_ST_CLOSING) { + if 
(sndvar->is_fin_sent) { + /* if the sequence is for FIN, send FIN */ + if (cur_stream->snd_nxt == sndvar->fss) { + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_FIN | TCP_FLAG_ACK, NULL, 0); + } else { + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_ACK, NULL, 0); + } + } else { + /* if FIN is not sent, send fin with ack */ + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_FIN | TCP_FLAG_ACK, NULL, 0); + } + + } else if (cur_stream->state == TCP_ST_TIME_WAIT) { + /* Send ACK here */ + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_ACK, NULL, 0); + + } else if (cur_stream->state == TCP_ST_CLOSED) { + /* Send RST here */ + TRACE_DBG("Stream %d: Try sending RST (TCP_ST_CLOSED)\n", cur_stream->id); + /* first flush the data and ack */ + if (sndvar->on_send_list || sndvar->on_ack_list) { + ret = -1; + } else { + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_RST, NULL, 0); + if (ret >= 0) { + DestroyTCPStream(mtcp, cur_stream); + } + } + } + + return ret; +} +/*----------------------------------------------------------------------------*/ +inline int WriteTCPControlList(mtcp_manager_t mtcp, struct mtcp_sender *sender, uint32_t cur_ts, int thresh) +{ + tcp_stream *cur_stream; + tcp_stream *next, *last; + int cnt = 0; + int ret; + + thresh = MIN(thresh, sender->control_list_cnt); + + /* Send TCP control messages */ + cnt = 0; + cur_stream = TAILQ_FIRST(&sender->control_list); + last = TAILQ_LAST(&sender->control_list, control_head); + while (cur_stream) { + if (++cnt > thresh) + break; + + TRACE_LOOP("Inside control loop. 
cnt: %u, stream: %d\n", cnt, cur_stream->id); + next = TAILQ_NEXT(cur_stream, sndvar->control_link); + + TAILQ_REMOVE(&sender->control_list, cur_stream, sndvar->control_link); + sender->control_list_cnt--; + + if (cur_stream->sndvar->on_control_list) { + cur_stream->sndvar->on_control_list = FALSE; + //TRACE_DBG("Stream %u: Sending control packet\n", cur_stream->id); + ret = SendControlPacket(mtcp, cur_stream, cur_ts); + if (ret == -2) { + TAILQ_INSERT_HEAD(&sender->control_list, cur_stream, sndvar->control_link); + cur_stream->sndvar->on_control_list = TRUE; + sender->control_list_cnt++; + /* since there is no available write buffer, break */ + break; + } else if (ret < 0) { + /* try again after handling other streams */ + TAILQ_INSERT_TAIL(&sender->control_list, cur_stream, sndvar->control_link); + cur_stream->sndvar->on_control_list = TRUE; + sender->control_list_cnt++; + } + } else { + TRACE_ERROR("Stream %d: not on control list.\n", cur_stream->id); + } + + if (cur_stream == last) + break; + cur_stream = next; + } + + return cnt; +} +/*----------------------------------------------------------------------------*/ +inline int WriteTCPDataList(mtcp_manager_t mtcp, struct mtcp_sender *sender, uint32_t cur_ts, int thresh) +{ + tcp_stream *cur_stream; + tcp_stream *next, *last; + int cnt = 0; + int ret; + + /* Send data */ + cnt = 0; + cur_stream = TAILQ_FIRST(&sender->send_list); + last = TAILQ_LAST(&sender->send_list, send_head); + while (cur_stream) { + if (++cnt > thresh) + break; + + TRACE_LOOP("Inside send loop. 
cnt: %u, stream: %d\n", cnt, cur_stream->id); + next = TAILQ_NEXT(cur_stream, sndvar->send_link); + + TAILQ_REMOVE(&sender->send_list, cur_stream, sndvar->send_link); + if (cur_stream->sndvar->on_send_list) { + ret = 0; + + /* Send data here */ + /* Only can send data when ESTABLISHED or CLOSE_WAIT */ + if (cur_stream->state == TCP_ST_ESTABLISHED) { + if (cur_stream->sndvar->on_control_list) { + /* delay sending data after until on_control_list becomes off */ + //TRACE_DBG("Stream %u: delay sending data.\n", cur_stream->id); + ret = -1; + } else { + ret = FlushTCPSendingBuffer(mtcp, cur_stream, cur_ts); + } + } else if (cur_stream->state == TCP_ST_CLOSE_WAIT || cur_stream->state == TCP_ST_FIN_WAIT_1 || + cur_stream->state == TCP_ST_LAST_ACK) { + ret = FlushTCPSendingBuffer(mtcp, cur_stream, cur_ts); + } else { + TRACE_DBG("Stream %d: on_send_list at state %s\n", cur_stream->id, TCPStateToString(cur_stream)); +#if defined(DUMP_STREAM) + DumpStream(mtcp, cur_stream); +#endif + } + + if (ret < 0) { + TAILQ_INSERT_TAIL(&sender->send_list, cur_stream, sndvar->send_link); + /* since there is no available write buffer, break */ + break; + + } else { + cur_stream->sndvar->on_send_list = FALSE; + sender->send_list_cnt--; + /* the ret value is the number of packets sent. 
*/ + /* decrease ack_cnt for the piggybacked acks */ +#if ACK_PIGGYBACK + if (cur_stream->sndvar->ack_cnt > 0) { + if (cur_stream->sndvar->ack_cnt > ret) { + cur_stream->sndvar->ack_cnt -= ret; + } else { + cur_stream->sndvar->ack_cnt = 0; + } + } +#endif +#if 1 + if (cur_stream->control_list_waiting) { + if (!cur_stream->sndvar->on_ack_list) { + cur_stream->control_list_waiting = FALSE; + AddtoControlList(mtcp, cur_stream, cur_ts); + } + } +#endif + } + } else { + TRACE_ERROR("Stream %d: not on send list.\n", cur_stream->id); +#ifdef DUMP_STREAM + DumpStream(mtcp, cur_stream); +#endif + } + + if (cur_stream == last) + break; + cur_stream = next; + } + + return cnt; +} +/*----------------------------------------------------------------------------*/ +inline int WriteTCPACKList(mtcp_manager_t mtcp, struct mtcp_sender *sender, uint32_t cur_ts, int thresh) +{ + tcp_stream *cur_stream; + tcp_stream *next, *last; + int to_ack; + int cnt = 0; + int ret; + + /* Send aggregated acks */ + cnt = 0; + cur_stream = TAILQ_FIRST(&sender->ack_list); + last = TAILQ_LAST(&sender->ack_list, ack_head); + while (cur_stream) { + if (++cnt > thresh) + break; + + TRACE_LOOP("Inside ack loop. cnt: %u\n", cnt); + next = TAILQ_NEXT(cur_stream, sndvar->ack_link); + + if (cur_stream->sndvar->on_ack_list) { + /* this list is only to ack the data packets */ + /* if the ack is not data ack, then it will not process here */ + to_ack = FALSE; + if (cur_stream->state == TCP_ST_ESTABLISHED || cur_stream->state == TCP_ST_CLOSE_WAIT || + cur_stream->state == TCP_ST_FIN_WAIT_1 || cur_stream->state == TCP_ST_FIN_WAIT_2 || + cur_stream->state == TCP_ST_TIME_WAIT) { + /* TIMEWAIT is possible since the ack is queued + at FIN_WAIT_2 */ + if (cur_stream->rcvvar->rcvbuf) { + if (TCP_SEQ_LEQ(cur_stream->rcv_nxt, cur_stream->rcvvar->rcvbuf->head_seq + + cur_stream->rcvvar->rcvbuf->merged_len)) { + to_ack = TRUE; + } + } + } else { + TRACE_DBG("Stream %u (%s): " + "Try sending ack at not proper state. 
" + "seq: %u, ack_seq: %u, on_control_list: %u\n", + cur_stream->id, TCPStateToString(cur_stream), cur_stream->snd_nxt, cur_stream->rcv_nxt, + cur_stream->sndvar->on_control_list); +#ifdef DUMP_STREAM + DumpStream(mtcp, cur_stream); +#endif + } + + if (to_ack) { + /* send the queued ack packets */ + while (cur_stream->sndvar->ack_cnt > 0) { + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_ACK, NULL, 0); + if (ret < 0) { + /* since there is no available write buffer, break */ + break; + } + cur_stream->sndvar->ack_cnt--; + } + + /* if is_wack is set, send packet to get window advertisement */ + if (cur_stream->sndvar->is_wack) { + cur_stream->sndvar->is_wack = FALSE; + ret = SendTCPPacket(mtcp, cur_stream, cur_ts, TCP_FLAG_ACK | TCP_FLAG_WACK, NULL, 0); + if (ret < 0) { + /* since there is no available write buffer, break */ + cur_stream->sndvar->is_wack = TRUE; + } + } + + if (!(cur_stream->sndvar->ack_cnt || cur_stream->sndvar->is_wack)) { + cur_stream->sndvar->on_ack_list = FALSE; + TAILQ_REMOVE(&sender->ack_list, cur_stream, sndvar->ack_link); + sender->ack_list_cnt--; + } + } else { + cur_stream->sndvar->on_ack_list = FALSE; + cur_stream->sndvar->ack_cnt = 0; + cur_stream->sndvar->is_wack = 0; + TAILQ_REMOVE(&sender->ack_list, cur_stream, sndvar->ack_link); + sender->ack_list_cnt--; + } + + if (cur_stream->control_list_waiting) { + if (!cur_stream->sndvar->on_send_list) { + cur_stream->control_list_waiting = FALSE; + AddtoControlList(mtcp, cur_stream, cur_ts); + } + } + } else { + TRACE_ERROR("Stream %d: not on ack list.\n", cur_stream->id); + TAILQ_REMOVE(&sender->ack_list, cur_stream, sndvar->ack_link); + sender->ack_list_cnt--; +#ifdef DUMP_STREAM + thread_printf(mtcp, mtcp->log_fp, "Stream %u: not on ack list.\n", cur_stream->id); + DumpStream(mtcp, cur_stream); +#endif + } + + if (cur_stream == last) + break; + cur_stream = next; + } + + return cnt; +} +/*----------------------------------------------------------------------------*/ +inline struct 
mtcp_sender *GetSender(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + if (cur_stream->sndvar->nif_out < 0) { + return mtcp->g_sender; + } + + int eidx = CONFIG.nif_to_eidx[cur_stream->sndvar->nif_out]; + if (eidx < 0 || eidx >= CONFIG.eths_num) { + TRACE_ERROR("(NEVER HAPPEN) Failed to find appropriate sender.\n"); + return NULL; + } + + return mtcp->n_sender[eidx]; +} +/*----------------------------------------------------------------------------*/ +inline void AddtoControlList(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts) +{ + (void)cur_ts; +#if TRY_SEND_BEFORE_QUEUE + int ret; + struct mtcp_sender *sender = GetSender(mtcp, cur_stream); + assert(sender != NULL); + + ret = SendControlPacket(mtcp, cur_stream, cur_ts); + if (ret < 0) { +#endif + if (!cur_stream->sndvar->on_control_list) { + struct mtcp_sender *sender = GetSender(mtcp, cur_stream); + assert(sender != NULL); + + cur_stream->sndvar->on_control_list = TRUE; + TAILQ_INSERT_TAIL(&sender->control_list, cur_stream, sndvar->control_link); + sender->control_list_cnt++; + //TRACE_DBG("Stream %u: added to control list (cnt: %d)\n", + // cur_stream->id, sender->control_list_cnt); + } +#if TRY_SEND_BEFORE_QUEUE + } else { + if (cur_stream->sndvar->on_control_list) { + cur_stream->sndvar->on_control_list = FALSE; + TAILQ_REMOVE(&sender->control_list, cur_stream, sndvar->control_link); + sender->control_list_cnt--; + } + } +#endif +} +/*----------------------------------------------------------------------------*/ +inline void AddtoSendList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + struct mtcp_sender *sender = GetSender(mtcp, cur_stream); + assert(sender != NULL); + + if (!cur_stream->sndvar->sndbuf) { + TRACE_ERROR("[%d] Stream %d: No send buffer available.\n", mtcp->ctx->cpu, cur_stream->id); + assert(0); + return; + } + + if (!cur_stream->sndvar->on_send_list) { + cur_stream->sndvar->on_send_list = TRUE; + TAILQ_INSERT_TAIL(&sender->send_list, cur_stream, sndvar->send_link); + 
sender->send_list_cnt++; + } +} +/*----------------------------------------------------------------------------*/ +inline void AddtoACKList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + struct mtcp_sender *sender = GetSender(mtcp, cur_stream); + assert(sender != NULL); + + if (!cur_stream->sndvar->on_ack_list) { + cur_stream->sndvar->on_ack_list = TRUE; + TAILQ_INSERT_TAIL(&sender->ack_list, cur_stream, sndvar->ack_link); + sender->ack_list_cnt++; + } +} +/*----------------------------------------------------------------------------*/ +inline void RemoveFromControlList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + struct mtcp_sender *sender = GetSender(mtcp, cur_stream); + assert(sender != NULL); + + if (cur_stream->sndvar->on_control_list) { + cur_stream->sndvar->on_control_list = FALSE; + TAILQ_REMOVE(&sender->control_list, cur_stream, sndvar->control_link); + sender->control_list_cnt--; + //TRACE_DBG("Stream %u: Removed from control list (cnt: %d)\n", + // cur_stream->id, sender->control_list_cnt); + } +} +/*----------------------------------------------------------------------------*/ +inline void RemoveFromSendList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + struct mtcp_sender *sender = GetSender(mtcp, cur_stream); + assert(sender != NULL); + + if (cur_stream->sndvar->on_send_list) { + cur_stream->sndvar->on_send_list = FALSE; + TAILQ_REMOVE(&sender->send_list, cur_stream, sndvar->send_link); + sender->send_list_cnt--; + } +} +/*----------------------------------------------------------------------------*/ +inline void RemoveFromACKList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + struct mtcp_sender *sender = GetSender(mtcp, cur_stream); + assert(sender != NULL); + + if (cur_stream->sndvar->on_ack_list) { + cur_stream->sndvar->on_ack_list = FALSE; + TAILQ_REMOVE(&sender->ack_list, cur_stream, sndvar->ack_link); + sender->ack_list_cnt--; + } +} +/*----------------------------------------------------------------------------*/ +inline void 
EnqueueACK(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts, uint8_t opt) +{ + (void)cur_ts; + if (!(cur_stream->state == TCP_ST_ESTABLISHED || cur_stream->state == TCP_ST_CLOSE_WAIT || + cur_stream->state == TCP_ST_FIN_WAIT_1 || cur_stream->state == TCP_ST_FIN_WAIT_2)) { + TRACE_DBG("Stream %u: Enqueueing ack at state %s\n", cur_stream->id, TCPStateToString(cur_stream)); + } + + if (opt == ACK_OPT_NOW) { + if (cur_stream->sndvar->ack_cnt < cur_stream->sndvar->ack_cnt + 1) { + cur_stream->sndvar->ack_cnt++; + } + } else if (opt == ACK_OPT_AGGREGATE) { + if (cur_stream->sndvar->ack_cnt == 0) { + cur_stream->sndvar->ack_cnt = 1; + } + } else if (opt == ACK_OPT_WACK) { + cur_stream->sndvar->is_wack = TRUE; + } + AddtoACKList(mtcp, cur_stream); +} +/*----------------------------------------------------------------------------*/ +inline void DumpControlList(mtcp_manager_t mtcp, struct mtcp_sender *sender) +{ + (void)mtcp; + tcp_stream *stream; + + TRACE_DBG("Dumping control list (count: %d):\n", sender->control_list_cnt); + TAILQ_FOREACH(stream, &sender->control_list, sndvar->control_link) + { + TRACE_DBG("Stream id: %u in control list\n", stream->id); + } +} diff --git a/lib/flash/mtcp/tcp_rb_frag_queue.c b/lib/flash/mtcp/tcp_rb_frag_queue.c new file mode 100644 index 0000000..76aabd1 --- /dev/null +++ b/lib/flash/mtcp/tcp_rb_frag_queue.c @@ -0,0 +1,123 @@ +/* + * TCP free fragment queue for ring buffer - tcp_rb_frag_queue.c/h + * + * EunYoung Jeong + * + * Part of this code borrows Click's simple queue implementation + * + * ============================== Click License ============================= + * + * Copyright (c) 1999-2000 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, subject to the conditions + * listed in the Click LICENSE file. 
These conditions include: you must + * preserve this copyright notice, and you cannot mention the copyright + * holders in advertising related to the Software without their permission. + * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This + * notice is a summary of the Click LICENSE file; the license in that file is + * legally binding. + */ + +#include "tcp_rb_frag_queue.h" +#include "debug.h" + +/*----------------------------------------------------------------------------*/ +#ifndef _INDEX_TYPE_ +#define _INDEX_TYPE_ +typedef uint32_t index_type; +typedef int32_t signed_index_type; +#endif +/*---------------------------------------------------------------------------*/ +struct rb_frag_queue { + index_type _capacity; + volatile index_type _head; + volatile index_type _tail; + + struct fragment_ctx *volatile *_q; +}; +/*----------------------------------------------------------------------------*/ +static inline index_type NextIndex(rb_frag_queue_t rb_fragq, index_type i) +{ + return (i != rb_fragq->_capacity ? i + 1 : 0); +} +/*---------------------------------------------------------------------------*/ +static inline index_type PrevIndex(rb_frag_queue_t rb_fragq, index_type i) +{ + return (i != 0 ? 
i - 1 : rb_fragq->_capacity); +} +/*---------------------------------------------------------------------------*/ +static inline void RBFragMemoryBarrier(struct fragment_ctx *volatile frag, volatile index_type index) +{ + __asm__ volatile("" : : "m"(frag), "m"(index)); +} +/*---------------------------------------------------------------------------*/ +rb_frag_queue_t CreateRBFragQueue(int capacity) +{ + rb_frag_queue_t rb_fragq; + + rb_fragq = (rb_frag_queue_t)calloc(1, sizeof(struct rb_frag_queue)); + if (!rb_fragq) + return NULL; + + rb_fragq->_q = (struct fragment_ctx **)calloc(capacity + 1, sizeof(struct fragment_ctx *)); + if (!rb_fragq->_q) { + free(rb_fragq); + return NULL; + } + + rb_fragq->_capacity = capacity; + rb_fragq->_head = rb_fragq->_tail = 0; + + return rb_fragq; +} +/*---------------------------------------------------------------------------*/ +void DestroyRBFragQueue(rb_frag_queue_t rb_fragq) +{ + if (!rb_fragq) + return; + + if (rb_fragq->_q) { + void *q = (void *)(uintptr_t)rb_fragq->_q; + free(q); + rb_fragq->_q = NULL; + } + + free(rb_fragq); +} +/*---------------------------------------------------------------------------*/ +int RBFragEnqueue(rb_frag_queue_t rb_fragq, struct fragment_ctx *frag) +{ + index_type h = rb_fragq->_head; + index_type t = rb_fragq->_tail; + index_type nt = NextIndex(rb_fragq, t); + + if (nt != h) { + rb_fragq->_q[t] = frag; + RBFragMemoryBarrier(rb_fragq->_q[t], rb_fragq->_tail); + rb_fragq->_tail = nt; + return 0; + } + + TRACE_ERROR("Exceed capacity of frag queue!\n"); + return -1; +} +/*---------------------------------------------------------------------------*/ +struct fragment_ctx *RBFragDequeue(rb_frag_queue_t rb_fragq) +{ + index_type h = rb_fragq->_head; + index_type t = rb_fragq->_tail; + + if (h != t) { + struct fragment_ctx *frag = rb_fragq->_q[h]; + RBFragMemoryBarrier(rb_fragq->_q[h], rb_fragq->_head); + rb_fragq->_head = NextIndex(rb_fragq, h); + assert(frag); + + return frag; + } + + return NULL; 
+} +/*---------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/tcp_ring_buffer.c b/lib/flash/mtcp/tcp_ring_buffer.c new file mode 100644 index 0000000..0089eb3 --- /dev/null +++ b/lib/flash/mtcp/tcp_ring_buffer.c @@ -0,0 +1,432 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include + +#include "tcp_ring_buffer.h" +#include "tcp_rb_frag_queue.h" +#include "memory_mgt.h" +#include "debug.h" + +#define MAX_RB_SIZE (16 * 1024 * 1024) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#ifdef ENABLELRO +#define __MEMCPY_DATA_2_BUFFER \ + mtcp_manager_t mtcp = rbm->mtcp; \ + if (mtcp->iom == &dpdk_module_func && len > TCP_DEFAULT_MSS) \ + mtcp->iom->dev_ioctl(mtcp->ctx, 0, PKT_RX_TCP_LROSEG, buff->head + putx); \ + else \ + memcpy(buff->head + putx, data, len); +#endif +/*----------------------------------------------------------------------------*/ +struct rb_manager { + size_t chunk_size; + uint32_t cur_num; + uint32_t cnum; + + mem_pool_t mp; + mem_pool_t frag_mp; + + rb_frag_queue_t free_fragq; /* free fragment queue (for app thread) */ + rb_frag_queue_t free_fragq_int; /* free fragment queue (only for mtcp) */ +#ifdef ENABLELRO + mtcp_manager_t mtcp; +#endif +} rb_manager; +/*----------------------------------------------------------------------------*/ +uint32_t RBGetCurnum(rb_manager_t rbm) +{ + return rbm->cur_num; +} +/*-----------------------------------------------------------------------------*/ +void RBPrintInfo(struct tcp_ring_buffer *buff) +{ + printf("buff_data %p, buff_size %d, buff_mlen %d, " + "buff_clen %lu, buff_head %p (%d), buff_tail (%d)\n", + buff->data, buff->size, buff->merged_len, buff->cum_len, buff->head, buff->head_offset, buff->tail_offset); +} +/*----------------------------------------------------------------------------*/ +void RBPrintStr(struct tcp_ring_buffer *buff) +{ + RBPrintInfo(buff); + printf("%s\n", buff->head); +} +/*----------------------------------------------------------------------------*/ +void RBPrintHex(struct tcp_ring_buffer *buff) +{ + int i; + + RBPrintInfo(buff); + + for (i = 0; i < buff->merged_len; i++) { + if (i != 0 && i % 16 == 0) + printf("\n"); + printf("%0x ", *((unsigned char *)buff->head + 
i)); + } + printf("\n"); +} +/*----------------------------------------------------------------------------*/ +rb_manager_t RBManagerCreate(mtcp_manager_t mtcp, size_t chunk_size, uint32_t cnum) +{ + (void)mtcp; + rb_manager_t rbm = (rb_manager_t)calloc(1, sizeof(rb_manager)); + + if (!rbm) { + perror("rbm_create calloc"); + return NULL; + } + + rbm->chunk_size = chunk_size; + rbm->cnum = cnum; +#if !defined(DISABLE_DPDK) && !defined(ENABLE_ONVM) + char pool_name[RTE_MEMPOOL_NAMESIZE]; + sprintf(pool_name, "rbm_pool_%u", mtcp->ctx->cpu); + rbm->mp = (mem_pool_t)MPCreate(pool_name, chunk_size, (uint64_t)chunk_size * cnum); +#else + rbm->mp = (mem_pool_t)MPCreate(chunk_size, (uint64_t)chunk_size * cnum); +#endif + if (!rbm->mp) { + TRACE_ERROR("Failed to allocate mp pool.\n"); + free(rbm); + return NULL; + } +#if !defined(DISABLE_DPDK) && !defined(ENABLE_ONVM) + sprintf(pool_name, "frag_mp_%u", mtcp->ctx->cpu); + rbm->frag_mp = (mem_pool_t)MPCreate(pool_name, sizeof(struct fragment_ctx), sizeof(struct fragment_ctx) * cnum); +#else + rbm->frag_mp = (mem_pool_t)MPCreate(sizeof(struct fragment_ctx), sizeof(struct fragment_ctx) * cnum); +#endif + if (!rbm->frag_mp) { + TRACE_ERROR("Failed to allocate frag_mp pool.\n"); + MPDestroy(rbm->mp); + free(rbm); + return NULL; + } + + rbm->free_fragq = CreateRBFragQueue(cnum); + if (!rbm->free_fragq) { + TRACE_ERROR("Failed to create free fragment queue.\n"); + MPDestroy(rbm->mp); + MPDestroy(rbm->frag_mp); + free(rbm); + return NULL; + } + rbm->free_fragq_int = CreateRBFragQueue(cnum); + if (!rbm->free_fragq_int) { + TRACE_ERROR("Failed to create internal free fragment queue.\n"); + MPDestroy(rbm->mp); + MPDestroy(rbm->frag_mp); + DestroyRBFragQueue(rbm->free_fragq); + free(rbm); + return NULL; + } + +#ifdef ENABLELRO + rbm->mtcp = mtcp; +#endif + return rbm; +} +/*----------------------------------------------------------------------------*/ +static inline void FreeFragmentContextSingle(rb_manager_t rbm, struct fragment_ctx 
*frag) +{ + if (frag->is_calloc) + free(frag); + else + MPFreeChunk(rbm->frag_mp, frag); +} +/*----------------------------------------------------------------------------*/ +static void FreeFragmentContext(rb_manager_t rbm, struct fragment_ctx *fctx) +{ + struct fragment_ctx *remove; + + assert(fctx); + if (fctx == NULL) + return; + + while (fctx) { + remove = fctx; + fctx = fctx->next; + FreeFragmentContextSingle(rbm, remove); + } +} +/*----------------------------------------------------------------------------*/ +static struct fragment_ctx *AllocateFragmentContext(rb_manager_t rbm) +{ + /* this function should be called only in mtcp thread */ + struct fragment_ctx *frag; + + /* first try dequeue the fragment in free fragment queue */ + frag = RBFragDequeue(rbm->free_fragq); + if (!frag) { + frag = RBFragDequeue(rbm->free_fragq_int); + if (!frag) { + /* next fall back to fetching from mempool */ + frag = MPAllocateChunk(rbm->frag_mp); + if (!frag) { + TRACE_ERROR("fragments depleted, fall back to calloc\n"); + frag = calloc(1, sizeof(struct fragment_ctx)); + if (frag == NULL) { + TRACE_ERROR("calloc failed\n"); + exit(-1); + } + frag->is_calloc = 1; /* mark it as allocated by calloc */ + } + } + } + memset(frag, 0, sizeof(*frag)); + return frag; +} +/*----------------------------------------------------------------------------*/ +struct tcp_ring_buffer *RBInit(rb_manager_t rbm, uint32_t init_seq) +{ + struct tcp_ring_buffer *buff = (struct tcp_ring_buffer *)calloc(1, sizeof(struct tcp_ring_buffer)); + + if (buff == NULL) { + perror("rb_init buff"); + return NULL; + } + + buff->data = MPAllocateChunk(rbm->mp); + if (!buff->data) { + perror("rb_init MPAllocateChunk"); + free(buff); + return NULL; + } + + //memset(buff->data, 0, rbm->chunk_size); + + buff->size = rbm->chunk_size; + buff->head = buff->data; + buff->head_seq = init_seq; + buff->init_seq = init_seq; + + rbm->cur_num++; + + return buff; +} 
+/*----------------------------------------------------------------------------*/ +void RBFree(rb_manager_t rbm, struct tcp_ring_buffer *buff) +{ + assert(buff); + if (buff->fctx) { + FreeFragmentContext(rbm, buff->fctx); + buff->fctx = NULL; + } + + if (buff->data) { + MPFreeChunk(rbm->mp, buff->data); + } + + rbm->cur_num--; + + free(buff); +} +/*----------------------------------------------------------------------------*/ +#define MAXSEQ ((uint32_t)(0xFFFFFFFF)) +/*----------------------------------------------------------------------------*/ +static inline uint32_t GetMinSeq(uint32_t a, uint32_t b) +{ + if (a == b) + return a; + if (a < b) + return ((b - a) <= MAXSEQ / 2) ? a : b; + /* b < a */ + return ((a - b) <= MAXSEQ / 2) ? b : a; +} +/*----------------------------------------------------------------------------*/ +static inline uint32_t GetMaxSeq(uint32_t a, uint32_t b) +{ + if (a == b) + return a; + if (a < b) + return ((b - a) <= MAXSEQ / 2) ? b : a; + /* b < a */ + return ((a - b) <= MAXSEQ / 2) ? 
a : b; +} +/*----------------------------------------------------------------------------*/ +static inline int CanMerge(const struct fragment_ctx *a, const struct fragment_ctx *b) +{ + uint32_t a_end = a->seq + a->len + 1; + uint32_t b_end = b->seq + b->len + 1; + + if (GetMinSeq(a_end, b->seq) == a_end || GetMinSeq(b_end, a->seq) == b_end) + return 0; + return (1); +} +/*----------------------------------------------------------------------------*/ +static inline void MergeFragments(struct fragment_ctx *a, struct fragment_ctx *b) +{ + /* merge a into b */ + uint32_t min_seq, max_seq; + + min_seq = GetMinSeq(a->seq, b->seq); + max_seq = GetMaxSeq(a->seq + a->len, b->seq + b->len); + b->seq = min_seq; + b->len = max_seq - min_seq; +} +/*----------------------------------------------------------------------------*/ +int RBPut(rb_manager_t rbm, struct tcp_ring_buffer *buff, void *data, uint32_t len, uint32_t cur_seq) +{ + int putx, end_off; + struct fragment_ctx *new_ctx; + struct fragment_ctx *iter; + struct fragment_ctx *prev, *pprev; + int merged = 0; + + if (len <= 0) + return 0; + + // if data offset is smaller than head sequence, then drop + if (GetMinSeq(buff->head_seq, cur_seq) != buff->head_seq) + return 0; + + putx = cur_seq - buff->head_seq; + end_off = putx + len; + if (buff->size < end_off) { + return -2; + } + + // if buffer is at tail, move the data to the first of head + if (buff->size <= ((int)buff->head_offset + end_off)) { + memmove(buff->data, buff->head, buff->last_len); + buff->tail_offset -= buff->head_offset; + buff->head_offset = 0; + buff->head = buff->data; + } +#ifdef ENABLELRO + // copy data to buffer + __MEMCPY_DATA_2_BUFFER; +#else + //copy data to buffer + memcpy(buff->head + putx, data, len); +#endif + if (buff->tail_offset < buff->head_offset + end_off) + buff->tail_offset = buff->head_offset + end_off; + buff->last_len = buff->tail_offset - buff->head_offset; + + // create fragmentation context blocks + new_ctx = 
AllocateFragmentContext(rbm); + if (!new_ctx) { + perror("allocating new_ctx failed"); + return 0; + } + new_ctx->seq = cur_seq; + new_ctx->len = len; + new_ctx->next = NULL; + + // traverse the fragment list, and merge the new fragment if possible + for (iter = buff->fctx, prev = NULL, pprev = NULL; iter != NULL; pprev = prev, prev = iter, iter = iter->next) { + if (CanMerge(new_ctx, iter)) { + /* merge the first fragment into the second fragment */ + MergeFragments(new_ctx, iter); + + /* remove the first fragment */ + if (prev == new_ctx) { + if (pprev) + pprev->next = iter; + else + buff->fctx = iter; + prev = pprev; + } + FreeFragmentContextSingle(rbm, new_ctx); + new_ctx = iter; + merged = 1; + } else if (merged || GetMaxSeq(cur_seq + len, iter->seq) == iter->seq) { + /* merged at some point, but no more mergeable + then stop it now */ + break; + } + } + + if (!merged) { + if (buff->fctx == NULL) { + buff->fctx = new_ctx; + } else if (GetMinSeq(cur_seq, buff->fctx->seq) == cur_seq) { + /* if the new packet's seqnum is before the existing fragments */ + new_ctx->next = buff->fctx; + buff->fctx = new_ctx; + } else { + /* if the seqnum is in-between the fragments or + at the last */ + assert(GetMinSeq(cur_seq, prev->seq + prev->len) == prev->seq + prev->len); + prev->next = new_ctx; + new_ctx->next = iter; + } + } + if (buff->head_seq == buff->fctx->seq) { + buff->cum_len += buff->fctx->len - buff->merged_len; + buff->merged_len = buff->fctx->len; + } + + return len; +} +/*----------------------------------------------------------------------------*/ +size_t RBRemove(rb_manager_t rbm, struct tcp_ring_buffer *buff, size_t len, int option) +{ + /* this function should be called only in application thread */ + + if (buff->merged_len < (int)len) + len = buff->merged_len; + + if (len == 0) + return 0; + + buff->head_offset += len; + buff->head = buff->data + buff->head_offset; + buff->head_seq += len; + + buff->merged_len -= len; + buff->last_len -= len; + + // modify 
fragmentation chunks + if (len == buff->fctx->len) { + struct fragment_ctx *remove = buff->fctx; + buff->fctx = buff->fctx->next; + if (option == AT_APP) { + RBFragEnqueue(rbm->free_fragq, remove); + } else if (option == AT_MTCP) { + RBFragEnqueue(rbm->free_fragq_int, remove); + } + } else if (len < buff->fctx->len) { + buff->fctx->seq += len; + buff->fctx->len -= len; + } else { + assert(0); + } + + return len; +} +/*----------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/tcp_sb_queue.c b/lib/flash/mtcp/tcp_sb_queue.c new file mode 100644 index 0000000..c19dbb8 --- /dev/null +++ b/lib/flash/mtcp/tcp_sb_queue.c @@ -0,0 +1,123 @@ +/* + * TCP free send buffer queue - tcp_sb_queue.c/h + * + * EunYoung Jeong + * + * Part of this code borrows Click's simple queue implementation + * + * ============================== Click License ============================= + * + * Copyright (c) 1999-2000 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, subject to the conditions + * listed in the Click LICENSE file. These conditions include: you must + * preserve this copyright notice, and you cannot mention the copyright + * holders in advertising related to the Software without their permission. + * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This + * notice is a summary of the Click LICENSE file; the license in that file is + * legally binding. 
+ */ + +#include "tcp_sb_queue.h" +#include "debug.h" + +/*----------------------------------------------------------------------------*/ +#ifndef _INDEX_TYPE_ +#define _INDEX_TYPE_ +typedef uint32_t index_type; +typedef int32_t signed_index_type; +#endif +/*---------------------------------------------------------------------------*/ +struct sb_queue { + index_type _capacity; + volatile index_type _head; + volatile index_type _tail; + + struct tcp_send_buffer *volatile *_q; +}; +/*----------------------------------------------------------------------------*/ +static inline index_type NextIndex(sb_queue_t sq, index_type i) +{ + return (i != sq->_capacity ? i + 1 : 0); +} +/*---------------------------------------------------------------------------*/ +static inline index_type PrevIndex(sb_queue_t sq, index_type i) +{ + return (i != 0 ? i - 1 : sq->_capacity); +} +/*---------------------------------------------------------------------------*/ +static inline void SBMemoryBarrier(struct tcp_send_buffer *volatile buf, volatile index_type index) +{ + __asm__ volatile("" : : "m"(buf), "m"(index)); +} +/*---------------------------------------------------------------------------*/ +sb_queue_t CreateSBQueue(int capacity) +{ + sb_queue_t sq; + + sq = (sb_queue_t)calloc(1, sizeof(struct sb_queue)); + if (!sq) + return NULL; + + sq->_q = (struct tcp_send_buffer **)calloc(capacity + 1, sizeof(struct tcp_send_buffer *)); + if (!sq->_q) { + free(sq); + return NULL; + } + + sq->_capacity = capacity; + sq->_head = sq->_tail = 0; + + return sq; +} +/*---------------------------------------------------------------------------*/ +void DestroySBQueue(sb_queue_t sq) +{ + if (!sq) + return; + + if (sq->_q) { + void *q = (void *)(uintptr_t)sq->_q; + free(q); + sq->_q = NULL; + } + + free(sq); +} +/*---------------------------------------------------------------------------*/ +int SBEnqueue(sb_queue_t sq, struct tcp_send_buffer *buf) +{ + index_type h = sq->_head; + index_type t = 
sq->_tail; + index_type nt = NextIndex(sq, t); + + if (nt != h) { + sq->_q[t] = buf; + SBMemoryBarrier(sq->_q[t], sq->_tail); + sq->_tail = nt; + return 0; + } + + TRACE_ERROR("Exceed capacity of buf queue!\n"); + return -1; +} +/*---------------------------------------------------------------------------*/ +struct tcp_send_buffer *SBDequeue(sb_queue_t sq) +{ + index_type h = sq->_head; + index_type t = sq->_tail; + + if (h != t) { + struct tcp_send_buffer *buf = sq->_q[h]; + SBMemoryBarrier(sq->_q[h], sq->_head); + sq->_head = NextIndex(sq, h); + assert(buf); + + return buf; + } + + return NULL; +} +/*---------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/tcp_send_buffer.c b/lib/flash/mtcp/tcp_send_buffer.c new file mode 100644 index 0000000..5092119 --- /dev/null +++ b/lib/flash/mtcp/tcp_send_buffer.c @@ -0,0 +1,207 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "memory_mgt.h" +#include "debug.h" +#include "tcp_send_buffer.h" +#include "tcp_sb_queue.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +/*----------------------------------------------------------------------------*/ +struct sb_manager { + size_t chunk_size; + uint32_t cur_num; + uint32_t cnum; + mem_pool_t mp; + sb_queue_t freeq; + +} sb_manager; +/*----------------------------------------------------------------------------*/ +uint32_t SBGetCurnum(sb_manager_t sbm) +{ + return sbm->cur_num; +} +/*----------------------------------------------------------------------------*/ +sb_manager_t SBManagerCreate(mtcp_manager_t mtcp, size_t chunk_size, uint32_t cnum) +{ + (void)mtcp; + sb_manager_t sbm = (sb_manager_t)calloc(1, sizeof(sb_manager)); + if (!sbm) { + TRACE_ERROR("SBManagerCreate() failed. 
%s\n", strerror(errno)); + return NULL; + } + + sbm->chunk_size = chunk_size; + sbm->cnum = cnum; +#if !defined(DISABLE_DPDK) && !defined(ENABLE_ONVM) + char pool_name[RTE_MEMPOOL_NAMESIZE]; + sprintf(pool_name, "sbm_pool_%d", mtcp->ctx->cpu); + sbm->mp = (mem_pool_t)MPCreate(pool_name, chunk_size, (uint64_t)chunk_size * cnum); +#else + sbm->mp = (mem_pool_t)MPCreate(chunk_size, (uint64_t)chunk_size * cnum); +#endif + if (!sbm->mp) { + TRACE_ERROR("Failed to create mem pool for sb.\n"); + free(sbm); + return NULL; + } + + sbm->freeq = CreateSBQueue(cnum); + if (!sbm->freeq) { + TRACE_ERROR("Failed to create free buffer queue.\n"); + MPDestroy(sbm->mp); + free(sbm); + return NULL; + } + + return sbm; +} +/*----------------------------------------------------------------------------*/ +struct tcp_send_buffer *SBInit(sb_manager_t sbm, uint32_t init_seq) +{ + struct tcp_send_buffer *buf; + + /* first try dequeue from free buffer queue */ + buf = SBDequeue(sbm->freeq); + if (!buf) { + buf = (struct tcp_send_buffer *)malloc(sizeof(struct tcp_send_buffer)); + if (!buf) { + perror("malloc() for buf"); + return NULL; + } + buf->data = MPAllocateChunk(sbm->mp); + if (!buf->data) { + TRACE_ERROR("Failed to fetch memory chunk for data.\n"); + free(buf); + return NULL; + } + sbm->cur_num++; + } + + buf->head = buf->data; + + buf->head_off = buf->tail_off = 0; + buf->len = buf->cum_len = 0; + buf->size = sbm->chunk_size; + + buf->init_seq = buf->head_seq = init_seq; + + return buf; +} +/*----------------------------------------------------------------------------*/ +#if 0 +static void +SBFreeInternal(sb_manager_t sbm, struct tcp_send_buffer *buf) +{ + if (!buf) + return; + + if (buf->data) { + MPFreeChunk(sbm->mp, buf->data); + buf->data = NULL; + } + + sbm->cur_num--; + free(buf); +} +#endif +/*----------------------------------------------------------------------------*/ +void SBFree(sb_manager_t sbm, struct tcp_send_buffer *buf) +{ + if (!buf) + return; + + 
SBEnqueue(sbm->freeq, buf); +} +/*----------------------------------------------------------------------------*/ +size_t SBPut(sb_manager_t sbm, struct tcp_send_buffer *buf, const void *data, size_t len) +{ + (void)sbm; + size_t to_put; + + if (len <= 0) + return 0; + + /* if no space, return -2 */ + to_put = MIN(len, buf->size - buf->len); + if (to_put <= 0) { + return -2; + } + + if (buf->tail_off + to_put < buf->size) { + /* if the data fit into the buffer, copy it */ + memcpy(buf->data + buf->tail_off, data, to_put); + buf->tail_off += to_put; + } else { + /* if buffer overflows, move the existing payload and merge */ + memmove(buf->data, buf->head, buf->len); + buf->head = buf->data; + buf->head_off = 0; + memcpy(buf->head + buf->len, data, to_put); + buf->tail_off = buf->len + to_put; + } + buf->len += to_put; + buf->cum_len += to_put; + + return to_put; +} +/*----------------------------------------------------------------------------*/ +size_t SBRemove(sb_manager_t sbm, struct tcp_send_buffer *buf, size_t len) +{ + (void)sbm; + size_t to_remove; + + if (len <= 0) + return 0; + + to_remove = MIN(len, buf->len); + if (to_remove <= 0) { + return -2; + } + + buf->head_off += to_remove; + buf->head = buf->data + buf->head_off; + buf->head_seq += to_remove; + buf->len -= to_remove; + + /* if buffer is empty, move the head to 0 */ + if (buf->len == 0 && buf->head_off > 0) { + buf->head = buf->data; + buf->head_off = buf->tail_off = 0; + } + + return to_remove; +} +/*---------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/tcp_stream.c b/lib/flash/mtcp/tcp_stream.c new file mode 100644 index 0000000..20cbf4b --- /dev/null +++ b/lib/flash/mtcp/tcp_stream.c @@ -0,0 +1,657 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "tcp_stream.h" +#include "fhash.h" +#include "tcp_in.h" +#include "tcp_out.h" +#include "tcp_ring_buffer.h" +#include "tcp_send_buffer.h" +#include "eventpoll.h" +#include "ip_out.h" +#include "timer.h" +#include "debug.h" +#if RATE_LIMIT_ENABLED || PACING_ENABLED +#include "pacing.h" +#endif +#if USE_CCP +#include "ccp.h" +#endif + +#define TCP_MAX_SEQ 4294967295 + +/*---------------------------------------------------------------------------*/ +const char *state_str[] = { "TCP_ST_CLOSED", "TCP_ST_LISTEN", "TCP_ST_SYN_SENT", "TCP_ST_SYN_RCVD", + "TCP_ST_ESTABILSHED", "TCP_ST_FIN_WAIT_1", "TCP_ST_FIN_WAIT_2", "TCP_ST_CLOSE_WAIT", + "TCP_ST_CLOSING", "TCP_ST_LAST_ACK", "TCP_ST_TIME_WAIT" }; +/*---------------------------------------------------------------------------*/ +const char *close_reason_str[] = { "NOT_CLOSED", "CLOSE", "CLOSED", "CONN_FAIL", "CONN_LOST", "RESET", "NO_MEM", "DENIED", "TIMEDOUT" }; +/*---------------------------------------------------------------------------*/ +/* for rand_r() functions */ +static __thread unsigned int next_seed; +/*---------------------------------------------------------------------------*/ +inline const char *TCPStateToString(const tcp_stream *stream) +{ + return state_str[stream->state]; +} +/*---------------------------------------------------------------------------*/ +inline void InitializeTCPStreamManager(void) +{ + next_seed = time(NULL); +} +/*---------------------------------------------------------------------------*/ +unsigned int HashFlow(const void *f) +{ + const tcp_stream *flow = (const tcp_stream *)f; +#if 0 + unsigned long hash = 5381; + int c; + int index; + + char *str = (char *)&flow->saddr; + index = 0; + + while ((c = *str++) && index++ < 12) { + if (index == 8) { + str = (char *)&flow->sport; + } + hash = ((hash << 5) + hash) + c; + } + + return hash & (NUM_BINS_FLOWS - 1); +#else + unsigned int hash, i; + const char *key = (const char *)&flow->saddr; + + for (hash = i = 0; i < 12; 
++i) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash & (NUM_BINS_FLOWS - 1); +#endif +} +/*---------------------------------------------------------------------------*/ +int EqualFlow(const void *f1, const void *f2) +{ + const tcp_stream *flow1 = (const tcp_stream *)f1; + const tcp_stream *flow2 = (const tcp_stream *)f2; + + return (flow1->saddr == flow2->saddr && flow1->sport == flow2->sport && flow1->daddr == flow2->daddr && + flow1->dport == flow2->dport); +} + +#if USE_CCP +/*---------------------------------------------------------------------------*/ +unsigned int HashSID(const void *f) +{ + tcp_stream *flow = (tcp_stream *)f; + return (flow->id % (NUM_BINS_FLOWS - 1)); +} + +int EqualSID(const void *f1, const void *f2) +{ + return (((tcp_stream *)f1)->id == ((tcp_stream *)f2)->id); +} +/*----------------------------------------------------------------------------*/ +#endif + +inline void RaiseReadEvent(mtcp_manager_t mtcp, tcp_stream *stream) +{ + if (stream->socket) { + if (stream->socket->epoll & MTCP_EPOLLIN) { + AddEpollEvent(mtcp->ep, MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLIN); +#if BLOCKING_SUPPORT + } else if (!(stream->socket->opts & MTCP_NONBLOCK)) { + if (!stream->on_rcv_br_list) { + stream->on_rcv_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->rcv_br_list, stream, rcvvar->rcv_br_link); + mtcp->rcv_br_list_cnt++; + } +#endif + } + } else { + TRACE_EPOLL("Stream %d: Raising read without a socket!\n", stream->id); + } +} +/*---------------------------------------------------------------------------*/ +inline void RaiseWriteEvent(mtcp_manager_t mtcp, tcp_stream *stream) +{ + if (stream->socket) { + if (stream->socket->epoll & MTCP_EPOLLOUT) { + AddEpollEvent(mtcp->ep, MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLOUT); +#if BLOCKING_SUPPORT + } else if (!(stream->socket->opts & MTCP_NONBLOCK)) { + if (!stream->on_snd_br_list) { + 
stream->on_snd_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->snd_br_list, stream, sndvar->snd_br_link); + mtcp->snd_br_list_cnt++; + } +#endif + } + } else { + TRACE_EPOLL("Stream %d: Raising write without a socket!\n", stream->id); + } +} +/*---------------------------------------------------------------------------*/ +inline void RaiseCloseEvent(mtcp_manager_t mtcp, tcp_stream *stream) +{ + if (stream->socket) { + if (stream->socket->epoll & MTCP_EPOLLRDHUP) { + AddEpollEvent(mtcp->ep, MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLRDHUP); + } else if (stream->socket->epoll & MTCP_EPOLLIN) { + AddEpollEvent(mtcp->ep, MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLIN); +#if BLOCKING_SUPPORT + } else if (!(stream->socket->opts & MTCP_NONBLOCK)) { + //pthread_cond_signal(&stream->rcvvar->read_cond); + //pthread_cond_signal(&stream->sndvar->write_cond); + if (!stream->on_rcv_br_list) { + stream->on_rcv_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->rcv_br_list, stream, rcvvar->rcv_br_link); + mtcp->rcv_br_list_cnt++; + } + if (!stream->on_snd_br_list) { + stream->on_snd_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->snd_br_list, stream, sndvar->snd_br_link); + mtcp->snd_br_list_cnt++; + } +#endif + } + } else { + TRACE_EPOLL("Stream %d: Raising close without a socket!\n", stream->id); + } +} +/*---------------------------------------------------------------------------*/ +inline void RaiseErrorEvent(mtcp_manager_t mtcp, tcp_stream *stream) +{ + if (stream->socket) { + if (stream->socket->epoll & MTCP_EPOLLERR) { + AddEpollEvent(mtcp->ep, MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLERR); +#if BLOCKING_SUPPORT + } else if (!(stream->socket->opts & MTCP_NONBLOCK)) { + if (!stream->on_rcv_br_list) { + stream->on_rcv_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->rcv_br_list, stream, rcvvar->rcv_br_link); + mtcp->rcv_br_list_cnt++; + } + if (!stream->on_snd_br_list) { + stream->on_snd_br_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->snd_br_list, stream, sndvar->snd_br_link); + 
mtcp->snd_br_list_cnt++; + } +#endif + } + } else { + TRACE_EPOLL("Stream %d: Raising error without a socket!\n", stream->id); + } +} +/*---------------------------------------------------------------------------*/ +tcp_stream *CreateTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type, uint32_t saddr, uint16_t sport, uint32_t daddr, + uint16_t dport) +{ + tcp_stream *stream = NULL; + int ret; + + uint8_t is_external; + uint8_t *sa; + uint8_t *da; + + pthread_mutex_lock(&mtcp->ctx->flow_pool_lock); + + stream = (tcp_stream *)MPAllocateChunk(mtcp->flow_pool); + if (!stream) { + TRACE_ERROR("Cannot allocate memory for the stream. " + "CONFIG.max_concurrency: %d, concurrent: %u\n", + CONFIG.max_concurrency, mtcp->flow_cnt); + pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock); + return NULL; + } + memset(stream, 0, sizeof(tcp_stream)); + + stream->rcvvar = (struct tcp_recv_vars *)MPAllocateChunk(mtcp->rv_pool); + if (!stream->rcvvar) { + MPFreeChunk(mtcp->flow_pool, stream); + pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock); + return NULL; + } + stream->sndvar = (struct tcp_send_vars *)MPAllocateChunk(mtcp->sv_pool); + if (!stream->sndvar) { + MPFreeChunk(mtcp->rv_pool, stream->rcvvar); + MPFreeChunk(mtcp->flow_pool, stream); + pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock); + return NULL; + } + memset(stream->rcvvar, 0, sizeof(struct tcp_recv_vars)); + memset(stream->sndvar, 0, sizeof(struct tcp_send_vars)); + + stream->id = mtcp->g_id++; + stream->saddr = saddr; + stream->sport = sport; + stream->daddr = daddr; + stream->dport = dport; + + ret = StreamHTInsert(mtcp->tcp_flow_table, stream); + if (ret < 0) { + TRACE_ERROR("Stream %d: " + "Failed to insert the stream into hash table.\n", + stream->id); + MPFreeChunk(mtcp->flow_pool, stream); + pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock); + return NULL; + } + +#if USE_CCP + ret = StreamHTInsert(mtcp->tcp_sid_table, stream); + if (ret < 0) { + TRACE_ERROR("Stream %d: " + "Failed to insert the stream into 
SID lookup table.\n", + stream->id); + MPFreeChunk(mtcp->flow_pool, stream); + pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock); + return NULL; + } +#endif + + stream->on_hash_table = TRUE; + mtcp->flow_cnt++; + + pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock); + + if (socket) { + stream->socket = socket; + socket->stream = stream; + } + + stream->stream_type = type; + stream->state = TCP_ST_LISTEN; + + stream->on_rto_idx = -1; + + stream->sndvar->ip_id = 0; + stream->sndvar->mss = TCP_DEFAULT_MSS; + stream->sndvar->wscale_mine = TCP_DEFAULT_WSCALE; + stream->sndvar->wscale_peer = 0; + stream->sndvar->nif_out = GetOutputInterface(stream->daddr, &is_external); + stream->is_external = is_external; + + stream->sndvar->iss = rand_r(&next_seed) % TCP_MAX_SEQ; + //stream->sndvar->iss = 0; + stream->rcvvar->irs = 0; + + stream->snd_nxt = stream->sndvar->iss; + stream->sndvar->snd_una = stream->sndvar->iss; +#if USE_CCP + stream->sndvar->missing_seq = 0; +#endif + stream->sndvar->snd_wnd = CONFIG.sndbuf_size; + stream->rcv_nxt = 0; + stream->rcvvar->rcv_wnd = TCP_INITIAL_WINDOW; + + stream->rcvvar->snd_wl1 = stream->rcvvar->irs - 1; + + stream->sndvar->rto = TCP_INITIAL_RTO; + +#if BLOCKING_SUPPORT + if (pthread_cond_init(&stream->rcvvar->read_cond, NULL)) { + perror("pthread_cond_init of read_cond"); + return NULL; + } + if (pthread_cond_init(&stream->sndvar->write_cond, NULL)) { + perror("pthread_cond_init of write_cond"); + return NULL; + } +#endif + +#if USE_SPIN_LOCK + if (pthread_spin_init(&stream->rcvvar->read_lock, PTHREAD_PROCESS_PRIVATE)) { +#else + if (pthread_mutex_init(&stream->rcvvar->read_lock, NULL)) { +#endif + perror("pthread_mutex_init of read_lock"); +#if BLOCKING_SUPPORT + pthread_cond_destroy(&stream->rcvvar->read_cond); + pthread_cond_destroy(&stream->sndvar->write_cond); +#endif + return NULL; + } +#if USE_SPIN_LOCK + if (pthread_spin_init(&stream->sndvar->write_lock, PTHREAD_PROCESS_PRIVATE)) { + perror("pthread_spin_init of write_lock"); + 
pthread_spin_destroy(&stream->rcvvar->read_lock); +#else + if (pthread_mutex_init(&stream->sndvar->write_lock, NULL)) { + perror("pthread_mutex_init of write_lock"); + pthread_mutex_destroy(&stream->rcvvar->read_lock); +#endif +#if BLOCKING_SUPPORT + pthread_cond_destroy(&stream->rcvvar->read_cond); + pthread_cond_destroy(&stream->sndvar->write_cond); +#endif + return NULL; + } + + sa = (uint8_t *)&stream->saddr; + da = (uint8_t *)&stream->daddr; + TRACE_STREAM("CREATED NEW TCP STREAM %d: " + "%u.%u.%u.%u(%d) -> %u.%u.%u.%u(%d) (ISS: %u)\n", + stream->id, sa[0], sa[1], sa[2], sa[3], ntohs(stream->sport), da[0], da[1], da[2], da[3], ntohs(stream->dport), + stream->sndvar->iss); + +#if RATE_LIMIT_ENABLED + stream->bucket = NewTokenBucket(); +#endif +#if PACING_ENABLED + stream->pacer = NewPacketPacer(); +#endif +#if USE_CCP + ccp_create(mtcp, stream); +#endif + + UNUSED(da); + UNUSED(sa); + return stream; +} +/*---------------------------------------------------------------------------*/ +void DestroyTCPStream(mtcp_manager_t mtcp, tcp_stream *stream) +{ + struct sockaddr_in addr; + int bound_addr = FALSE; + uint8_t *sa, *da; + int ret; + +#ifdef DUMP_STREAM + if (stream->close_reason != TCP_ACTIVE_CLOSE && stream->close_reason != TCP_PASSIVE_CLOSE) { + thread_printf(mtcp, mtcp->log_fp, "Stream %d abnormally closed.\n", stream->id); + DumpStream(mtcp, stream); + DumpControlList(mtcp, mtcp->n_sender[0]); + } +#endif + + sa = (uint8_t *)&stream->saddr; + da = (uint8_t *)&stream->daddr; + TRACE_STREAM("DESTROY TCP STREAM %d: " + "%u.%u.%u.%u(%d) -> %u.%u.%u.%u(%d) (%s)\n", + stream->id, sa[0], sa[1], sa[2], sa[3], ntohs(stream->sport), da[0], da[1], da[2], da[3], ntohs(stream->dport), + close_reason_str[stream->close_reason]); + + if (stream->sndvar->sndbuf) { + TRACE_FSTAT("Stream %d: send buffer " + "cum_len: %lu, len: %u\n", + stream->id, stream->sndvar->sndbuf->cum_len, stream->sndvar->sndbuf->len); + } + if (stream->rcvvar->rcvbuf) { + TRACE_FSTAT("Stream %d: recv 
buffer " + "cum_len: %lu, merged_len: %u, last_len: %u\n", + stream->id, stream->rcvvar->rcvbuf->cum_len, stream->rcvvar->rcvbuf->merged_len, + stream->rcvvar->rcvbuf->last_len); + } + +#if RTM_STAT + /* Triple duplicated ack stats */ + if (stream->sndvar->rstat.tdp_ack_cnt) { + TRACE_FSTAT("Stream %d: triple duplicated ack: %u, " + "retransmission bytes: %u, average rtm bytes/ack: %u\n", + stream->id, stream->sndvar->rstat.tdp_ack_cnt, stream->sndvar->rstat.tdp_ack_bytes, + stream->sndvar->rstat.tdp_ack_bytes / stream->sndvar->rstat.tdp_ack_cnt); + } + + /* Retransmission timeout stats */ + if (stream->sndvar->rstat.rto_cnt > 0) { + TRACE_FSTAT("Stream %d: timeout count: %u, bytes: %u\n", stream->id, stream->sndvar->rstat.rto_cnt, + stream->sndvar->rstat.rto_bytes); + } + + /* Recovery stats */ + if (stream->sndvar->rstat.ack_upd_cnt) { + TRACE_FSTAT("Stream %d: snd_nxt update count: %u, " + "snd_nxt update bytes: %u, average update bytes/update: %u\n", + stream->id, stream->sndvar->rstat.ack_upd_cnt, stream->sndvar->rstat.ack_upd_bytes, + stream->sndvar->rstat.ack_upd_bytes / stream->sndvar->rstat.ack_upd_cnt); + } +#if TCP_OPT_SACK_ENABLED + if (stream->sndvar->rstat.sack_cnt) { + TRACE_FSTAT("Selective ack count: %u, bytes: %u, " + "average bytes/ack: %u\n", + stream->sndvar->rstat.sack_cnt, stream->sndvar->rstat.sack_bytes, + stream->sndvar->rstat.sack_bytes / stream->sndvar->rstat.sack_cnt); + } else { + TRACE_FSTAT("Selective ack count: %u, bytes: %u\n", stream->sndvar->rstat.sack_cnt, stream->sndvar->rstat.sack_bytes); + } + if (stream->sndvar->rstat.tdp_sack_cnt) { + TRACE_FSTAT("Selective tdp ack count: %u, bytes: %u, " + "average bytes/ack: %u\n", + stream->sndvar->rstat.tdp_sack_cnt, stream->sndvar->rstat.tdp_sack_bytes, + stream->sndvar->rstat.tdp_sack_bytes / stream->sndvar->rstat.tdp_sack_cnt); + } else { + TRACE_FSTAT("Selective ack count: %u, bytes: %u\n", stream->sndvar->rstat.tdp_sack_cnt, + stream->sndvar->rstat.tdp_sack_bytes); + } +#endif /* 
TCP_OPT_SACK_ENABLED */ +#endif /* RTM_STAT */ + + if (stream->is_bound_addr) { + bound_addr = TRUE; + addr.sin_addr.s_addr = stream->saddr; + addr.sin_port = stream->sport; + } + + RemoveFromControlList(mtcp, stream); + RemoveFromSendList(mtcp, stream); + RemoveFromACKList(mtcp, stream); + + if (stream->on_rto_idx >= 0) + RemoveFromRTOList(mtcp, stream); + + if (stream->on_timewait_list) + RemoveFromTimewaitList(mtcp, stream); + + if (CONFIG.tcp_timeout > 0) + RemoveFromTimeoutList(mtcp, stream); + +#if BLOCKING_SUPPORT + if (stream->on_snd_br_list) { + stream->on_snd_br_list = FALSE; + TAILQ_REMOVE(&mtcp->snd_br_list, stream, sndvar->snd_br_link); + mtcp->snd_br_list_cnt--; + } + if (stream->on_rcv_br_list) { + stream->on_rcv_br_list = FALSE; + TAILQ_REMOVE(&mtcp->rcv_br_list, stream, rcvvar->rcv_br_link); + mtcp->rcv_br_list_cnt--; + } + + if (!stream->epoll) { + pthread_cond_signal(&stream->rcvvar->read_cond); + pthread_cond_signal(&stream->sndvar->write_cond); + } + + if (pthread_cond_destroy(&stream->rcvvar->read_cond)) { + perror("pthread_cond_destroy of read_cond"); + } + if (pthread_cond_destroy(&stream->sndvar->write_cond)) { + perror("pthread_cond_destroy of write_cond"); + } +#endif + SBUF_LOCK_DESTROY(&stream->rcvvar->read_lock); + SBUF_LOCK_DESTROY(&stream->sndvar->write_lock); + + assert(stream->on_hash_table == TRUE); + + /* free ring buffers */ + if (stream->sndvar->sndbuf) { + SBFree(mtcp->rbm_snd, stream->sndvar->sndbuf); + stream->sndvar->sndbuf = NULL; + } + if (stream->rcvvar->rcvbuf) { + RBFree(mtcp->rbm_rcv, stream->rcvvar->rcvbuf); + stream->rcvvar->rcvbuf = NULL; + } + + pthread_mutex_lock(&mtcp->ctx->flow_pool_lock); + + /* remove from flow hash table */ + StreamHTRemove(mtcp->tcp_flow_table, stream); + stream->on_hash_table = FALSE; + + mtcp->flow_cnt--; + + MPFreeChunk(mtcp->rv_pool, stream->rcvvar); + MPFreeChunk(mtcp->sv_pool, stream->sndvar); + MPFreeChunk(mtcp->flow_pool, stream); + pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock); 
+ + if (bound_addr) { + if (mtcp->ap) { + ret = FreeAddress(mtcp->ap, &addr); + } else { + uint8_t is_external; + int nif = GetOutputInterface(addr.sin_addr.s_addr, &is_external); + if (nif < 0) { + TRACE_ERROR("nif is negative!\n"); + ret = -1; + } else { + int eidx = CONFIG.nif_to_eidx[nif]; + ret = FreeAddress(ap[eidx], &addr); + } + UNUSED(is_external); + } + if (ret < 0) { + TRACE_ERROR("(NEVER HAPPEN) Failed to free address.\n"); + } + } + +#ifdef NETSTAT +#if NETSTAT_PERTHREAD + TRACE_STREAM("Destroyed. Remaining flows: %u\n", mtcp->flow_cnt); +#endif /* NETSTAT_PERTHREAD */ +#endif /* NETSTAT */ + + UNUSED(da); + UNUSED(sa); +} +/*---------------------------------------------------------------------------*/ +void DumpStream(mtcp_manager_t mtcp, tcp_stream *stream) +{ + uint8_t *sa, *da; + struct tcp_send_vars *sndvar = stream->sndvar; + struct tcp_recv_vars *rcvvar = stream->rcvvar; + + sa = (uint8_t *)&stream->saddr; + da = (uint8_t *)&stream->daddr; + thread_printf(mtcp, mtcp->log_fp, + "========== Stream %u: " + "%u.%u.%u.%u(%u) -> %u.%u.%u.%u(%u) ==========\n", + stream->id, sa[0], sa[1], sa[2], sa[3], ntohs(stream->sport), da[0], da[1], da[2], da[3], ntohs(stream->dport)); + thread_printf(mtcp, mtcp->log_fp, "Stream id: %u, type: %u, state: %s, close_reason: %s\n", stream->id, stream->stream_type, + TCPStateToString(stream), close_reason_str[stream->close_reason]); + if (stream->socket) { + socket_map_t socket = stream->socket; + thread_printf(mtcp, mtcp->log_fp, + "Socket id: %d, type: %d, opts: %u\n" + "epoll: %u (IN: %u, OUT: %u, ERR: %u, RDHUP: %u, ET: %u)\n" + "events: %u (IN: %u, OUT: %u, ERR: %u, RDHUP: %u, ET: %u)\n", + socket->id, socket->socktype, socket->opts, socket->epoll, socket->epoll & MTCP_EPOLLIN, + socket->epoll & MTCP_EPOLLOUT, socket->epoll & MTCP_EPOLLERR, socket->epoll & MTCP_EPOLLRDHUP, + socket->epoll & MTCP_EPOLLET, socket->events, socket->events & MTCP_EPOLLIN, + socket->events & MTCP_EPOLLOUT, socket->events & MTCP_EPOLLERR, 
socket->events & MTCP_EPOLLRDHUP, + socket->events & MTCP_EPOLLET); + } else { + thread_printf(mtcp, mtcp->log_fp, "Socket: (null)\n"); + } + + thread_printf(mtcp, mtcp->log_fp, + "on_hash_table: %u, on_control_list: %u (wait: %u), on_send_list: %u, " + "on_ack_list: %u, is_wack: %u, ack_cnt: %u\n" + "on_rto_idx: %d, on_timewait_list: %u, on_timeout_list: %u, " + "on_rcv_br_list: %u, on_snd_br_list: %u\n" + "on_sendq: %u, on_ackq: %u, closed: %u, on_closeq: %u, " + "on_closeq_int: %u, on_resetq: %u, on_resetq_int: %u\n" + "have_reset: %u, is_fin_sent: %u, is_fin_ackd: %u, " + "saw_timestamp: %u, sack_permit: %u, " + "is_bound_addr: %u, need_wnd_adv: %u\n", + stream->on_hash_table, sndvar->on_control_list, stream->control_list_waiting, sndvar->on_send_list, + sndvar->on_ack_list, sndvar->is_wack, sndvar->ack_cnt, stream->on_rto_idx, stream->on_timewait_list, + stream->on_timeout_list, stream->on_rcv_br_list, stream->on_snd_br_list, sndvar->on_sendq, sndvar->on_ackq, + stream->closed, sndvar->on_closeq, sndvar->on_closeq_int, sndvar->on_resetq, sndvar->on_resetq_int, + stream->have_reset, sndvar->is_fin_sent, sndvar->is_fin_ackd, stream->saw_timestamp, stream->sack_permit, + stream->is_bound_addr, stream->need_wnd_adv); + + thread_printf(mtcp, mtcp->log_fp, "========== Send variables ==========\n"); + thread_printf(mtcp, mtcp->log_fp, + "ip_id: %u, mss: %u, eff_mss: %u, wscale (me, peer): (%u, %u), " + "nif_out: %d\n", + sndvar->ip_id, sndvar->mss, sndvar->eff_mss, sndvar->wscale_mine, sndvar->wscale_peer, sndvar->nif_out); + thread_printf(mtcp, mtcp->log_fp, + "snd_nxt: %u, snd_una: %u, iss: %u, fss: %u\nsnd_wnd: %u, " + "peer_wnd: %u, cwnd: %u, ssthresh: %u\n", + stream->snd_nxt, sndvar->snd_una, sndvar->iss, sndvar->fss, sndvar->snd_wnd, sndvar->peer_wnd, sndvar->cwnd, + sndvar->ssthresh); + + if (sndvar->sndbuf) { + thread_printf(mtcp, mtcp->log_fp, + "Send buffer: init_seq: %u, head_seq: %u, " + "len: %d, cum_len: %lu, size: %d\n", + sndvar->sndbuf->init_seq, 
sndvar->sndbuf->head_seq, sndvar->sndbuf->len, sndvar->sndbuf->cum_len, + sndvar->sndbuf->size); + } else { + thread_printf(mtcp, mtcp->log_fp, "Send buffer: (null)\n"); + } + thread_printf(mtcp, mtcp->log_fp, + "nrtx: %u, max_nrtx: %u, rto: %u, ts_rto: %u, " + "ts_lastack_sent: %u\n", + sndvar->nrtx, sndvar->max_nrtx, sndvar->rto, sndvar->ts_rto, sndvar->ts_lastack_sent); + + thread_printf(mtcp, mtcp->log_fp, "========== Receive variables ==========\n"); + thread_printf(mtcp, mtcp->log_fp, + "rcv_nxt: %u, irs: %u, rcv_wnd: %u, " + "snd_wl1: %u, snd_wl2: %u\n", + stream->rcv_nxt, rcvvar->irs, rcvvar->rcv_wnd, rcvvar->snd_wl1, rcvvar->snd_wl2); + if (rcvvar->rcvbuf) { + thread_printf(mtcp, mtcp->log_fp, + "Receive buffer: init_seq: %u, head_seq: %u, " + "merged_len: %d, cum_len: %lu, last_len: %d, size: %d\n", + rcvvar->rcvbuf->init_seq, rcvvar->rcvbuf->head_seq, rcvvar->rcvbuf->merged_len, rcvvar->rcvbuf->cum_len, + rcvvar->rcvbuf->last_len, rcvvar->rcvbuf->size); + } else { + thread_printf(mtcp, mtcp->log_fp, "Receive buffer: (null)\n"); + } + thread_printf(mtcp, mtcp->log_fp, "last_ack_seq: %u, dup_acks: %u\n", rcvvar->last_ack_seq, rcvvar->dup_acks); + thread_printf(mtcp, mtcp->log_fp, + "ts_recent: %u, ts_lastack_rcvd: %u, ts_last_ts_upd: %u, " + "ts_tw_expire: %u\n", + rcvvar->ts_recent, rcvvar->ts_lastack_rcvd, rcvvar->ts_last_ts_upd, rcvvar->ts_tw_expire); + thread_printf(mtcp, mtcp->log_fp, "srtt: %u, mdev: %u, mdev_max: %u, rttvar: %u, rtt_seq: %u\n", rcvvar->srtt, rcvvar->mdev, + rcvvar->mdev_max, rcvvar->rttvar, rcvvar->rtt_seq); +} diff --git a/lib/flash/mtcp/tcp_stream_queue.c b/lib/flash/mtcp/tcp_stream_queue.c new file mode 100644 index 0000000..a34b784 --- /dev/null +++ b/lib/flash/mtcp/tcp_stream_queue.c @@ -0,0 +1,203 @@ +/* + * TCP stream queue - tcp_stream_queue.c/h + * + * EunYoung Jeong + * + * Part of this code borrows Click's simple queue implementation + * + * ============================== Click License ============================= + * + 
* Copyright (c) 1999-2000 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, subject to the conditions + * listed in the Click LICENSE file. These conditions include: you must + * preserve this copyright notice, and you cannot mention the copyright + * holders in advertising related to the Software without their permission. + * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This + * notice is a summary of the Click LICENSE file; the license in that file is + * legally binding. + */ + +#include +#include + +#include "tcp_stream_queue.h" +#include "debug.h" + +#ifndef _INDEX_TYPE_ +#define _INDEX_TYPE_ +typedef uint32_t index_type; +typedef int32_t signed_index_type; +#endif +/*---------------------------------------------------------------------------*/ +struct stream_queue { + index_type _capacity; + volatile index_type _head; + volatile index_type _tail; + + struct tcp_stream *volatile *_q; +}; +/*----------------------------------------------------------------------------*/ +stream_queue_int *CreateInternalStreamQueue(int size) +{ + stream_queue_int *sq; + + sq = (stream_queue_int *)calloc(1, sizeof(stream_queue_int)); + if (!sq) { + return NULL; + } + + sq->array = (tcp_stream **)calloc(size, sizeof(tcp_stream *)); + if (!sq->array) { + free(sq); + return NULL; + } + + sq->size = size; + sq->first = sq->last = 0; + sq->count = 0; + + return sq; +} +/*----------------------------------------------------------------------------*/ +void DestroyInternalStreamQueue(stream_queue_int *sq) +{ + if (!sq) + return; + + if (sq->array) { + free(sq->array); + sq->array = NULL; + } + + free(sq); +} +/*----------------------------------------------------------------------------*/ +int StreamInternalEnqueue(stream_queue_int *sq, struct tcp_stream *stream) +{ + if 
(sq->count >= sq->size) { + /* queue is full */ + TRACE_INFO("[WARNING] Queue overflow. Set larger queue size! " + "count: %d, size: %d\n", + sq->count, sq->size); + return -1; + } + + sq->array[sq->last++] = stream; + sq->count++; + if (sq->last >= sq->size) { + sq->last = 0; + } + assert(sq->count <= sq->size); + + return 0; +} +/*----------------------------------------------------------------------------*/ +struct tcp_stream *StreamInternalDequeue(stream_queue_int *sq) +{ + struct tcp_stream *stream = NULL; + + if (sq->count <= 0) { + return NULL; + } + + stream = sq->array[sq->first++]; + assert(stream != NULL); + if (sq->first >= sq->size) { + sq->first = 0; + } + sq->count--; + assert(sq->count >= 0); + + return stream; +} +/*---------------------------------------------------------------------------*/ +static inline index_type NextIndex(stream_queue_t sq, index_type i) +{ + return (i != sq->_capacity ? i + 1 : 0); +} +/*---------------------------------------------------------------------------*/ +static inline index_type PrevIndex(stream_queue_t sq, index_type i) +{ + return (i != 0 ? 
i - 1 : sq->_capacity); +} +/*---------------------------------------------------------------------------*/ +int StreamQueueIsEmpty(stream_queue_t sq) +{ + return (sq->_head == sq->_tail); +} +/*---------------------------------------------------------------------------*/ +static inline void StreamMemoryBarrier(tcp_stream *volatile stream, volatile index_type index) +{ + __asm__ volatile("" : : "m"(stream), "m"(index)); +} +/*---------------------------------------------------------------------------*/ +stream_queue_t CreateStreamQueue(int capacity) +{ + stream_queue_t sq; + + sq = (stream_queue_t)calloc(1, sizeof(struct stream_queue)); + if (!sq) + return NULL; + + sq->_q = (tcp_stream **)calloc(capacity + 1, sizeof(tcp_stream *)); + if (!sq->_q) { + free(sq); + return NULL; + } + + sq->_capacity = capacity; + sq->_head = sq->_tail = 0; + + return sq; +} +/*---------------------------------------------------------------------------*/ +void DestroyStreamQueue(stream_queue_t sq) +{ + if (!sq) + return; + + if (sq->_q) { + void *q = (void *)(uintptr_t)sq->_q; + free(q); + sq->_q = NULL; + } + + free(sq); +} +/*---------------------------------------------------------------------------*/ +int StreamEnqueue(stream_queue_t sq, tcp_stream *stream) +{ + index_type h = sq->_head; + index_type t = sq->_tail; + index_type nt = NextIndex(sq, t); + + if (nt != h) { + sq->_q[t] = stream; + StreamMemoryBarrier(sq->_q[t], sq->_tail); + sq->_tail = nt; + return 0; + } + + TRACE_ERROR("Exceed capacity of stream queue!\n"); + return -1; +} +/*---------------------------------------------------------------------------*/ +tcp_stream *StreamDequeue(stream_queue_t sq) +{ + index_type h = sq->_head; + index_type t = sq->_tail; + + if (h != t) { + tcp_stream *stream = sq->_q[h]; + StreamMemoryBarrier(sq->_q[h], sq->_head); + sq->_head = NextIndex(sq, h); + assert(stream); + return stream; + } + + return NULL; +} 
+/*---------------------------------------------------------------------------*/ diff --git a/lib/flash/mtcp/tcp_util.c b/lib/flash/mtcp/tcp_util.c new file mode 100644 index 0000000..3e7ed66 --- /dev/null +++ b/lib/flash/mtcp/tcp_util.c @@ -0,0 +1,347 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#include "tcp_util.h" +#include "tcp_ring_buffer.h" +#include "eventpoll.h" +#include "debug.h" +#include "timer.h" +#include "ip_in.h" + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +/*---------------------------------------------------------------------------*/ +void ParseTCPOptions(tcp_stream *cur_stream, uint32_t cur_ts, const uint8_t *tcpopt, int len) +{ + int i; + int opt, optlen; + + for (i = 0; i < len;) { + opt = *(tcpopt + i++); + + if (opt == TCP_OPT_END) { // end of option field + break; + } else if (opt == TCP_OPT_NOP) { // no option + continue; + } else { + optlen = *(tcpopt + i++); + if (i + optlen - 2 > len) { + break; + } + + if (opt == TCP_OPT_MSS) { + cur_stream->sndvar->mss = *(tcpopt + i++) << 8; + cur_stream->sndvar->mss += *(tcpopt + i++); + cur_stream->sndvar->eff_mss = cur_stream->sndvar->mss; +#if TCP_OPT_TIMESTAMP_ENABLED + cur_stream->sndvar->eff_mss -= (TCP_OPT_TIMESTAMP_LEN + 2); +#endif + } else if (opt == TCP_OPT_WSCALE) { + cur_stream->sndvar->wscale_peer = *(tcpopt + i++); + } else if (opt == TCP_OPT_SACK_PERMIT) { + cur_stream->sack_permit = TRUE; + TRACE_SACK("Remote SACK permited.\n"); + } else if (opt == TCP_OPT_TIMESTAMP) { + TRACE_TSTAMP("Saw peer timestamp!\n"); + cur_stream->saw_timestamp = TRUE; + cur_stream->rcvvar->ts_recent = ntohl(*(const uint32_t *)(tcpopt + i)); + cur_stream->rcvvar->ts_last_ts_upd = cur_ts; + i += 8; + } else { + // not handle + i += optlen - 2; + } + } + } +} +/*---------------------------------------------------------------------------*/ +inline int ParseTCPTimestamp(tcp_stream *cur_stream, struct tcp_timestamp *ts, uint8_t *tcpopt, int len) +{ + (void)cur_stream; + int i; + int opt, optlen; + + for (i = 0; i < len;) { + opt = *(tcpopt + i++); + + if (opt == TCP_OPT_END) { // end of option field + break; + } else if (opt == TCP_OPT_NOP) { // no option + continue; + } else { + optlen = *(tcpopt + i++); + if (i + optlen - 2 > len) { + 
break; + } + + if (opt == TCP_OPT_TIMESTAMP) { + ts->ts_val = ntohl(*(uint32_t *)(tcpopt + i)); + ts->ts_ref = ntohl(*(uint32_t *)(tcpopt + i + 4)); + return TRUE; + } else { + // not handle + i += optlen - 2; + } + } + } + return FALSE; +} +#if TCP_OPT_SACK_ENABLED +/*----------------------------------------------------------------------------*/ +int SeqIsSacked(tcp_stream *cur_stream, uint32_t seq) +{ + uint8_t i; + uint32_t left, right; + for (i = 0; i < MAX_SACK_ENTRY; i++) { + left = cur_stream->rcvvar->sack_table[i].left_edge; + right = cur_stream->rcvvar->sack_table[i].right_edge; + if (seq >= left && seq < right) { + //fprintf(stderr, "Found seq=%u in (%u,%u)\n", seq - cur_stream->sndvar->iss, left - cur_stream->sndvar->iss, right - cur_stream->sndvar->iss); + return TRUE; + } + } + return FALSE; +} +/*----------------------------------------------------------------------------*/ +static void _update_sack_table(tcp_stream *cur_stream, uint32_t left_edge, uint32_t right_edge) +{ + uint8_t i, j; + uint32_t newly_sacked = 0; + long int ld, rd, lrd, rld; + for (i = 0; i < MAX_SACK_ENTRY; i++) { + ld = (long int)left_edge - cur_stream->rcvvar->sack_table[i].left_edge; + rd = (long int)right_edge - cur_stream->rcvvar->sack_table[i].right_edge; + // if block already in table, don't need to do anything + if (ld == 0 && rd == 0) { + return; + } + + lrd = (long int)left_edge - cur_stream->rcvvar->sack_table[i].right_edge; + rld = (long int)right_edge - cur_stream->rcvvar->sack_table[i].left_edge; + + // if block does not overlap i at all, skip + if (lrd > 0 || rld < 0) { + continue; + } + + // left_edge is further left than i.left_edge + if (ld < 0) { + newly_sacked += (-ld); + // expand i to account for this extra space, and merge with any + // blocks whose right_edge = i.left (i.e. 
blocks are touching) + cur_stream->rcvvar->sack_table[i].left_edge = left_edge; + for (j = 0; j < MAX_SACK_ENTRY; j++) { + if (cur_stream->rcvvar->sack_table[j].right_edge == left_edge) { + cur_stream->rcvvar->sack_table[i].left_edge = cur_stream->rcvvar->sack_table[j].right_edge; + cur_stream->rcvvar->sack_table[j].left_edge = 0; + cur_stream->rcvvar->sack_table[j].right_edge = 0; + break; + } + } + } + // right edge is further right than i.right_edge + if (rd > 0) { + newly_sacked += rd; + // expand i to account for this extra space, and merge with any + // blocks whose left_edge = i.right (i.e. blocks are touching) + cur_stream->rcvvar->sack_table[i].right_edge = right_edge; + for (j = 0; j < MAX_SACK_ENTRY; j++) { + if (cur_stream->rcvvar->sack_table[j].left_edge == right_edge) { + cur_stream->rcvvar->sack_table[i].right_edge = cur_stream->rcvvar->sack_table[j].left_edge; + cur_stream->rcvvar->sack_table[j].left_edge = 0; + cur_stream->rcvvar->sack_table[j].right_edge = 0; + break; + } + } + } + } + if (newly_sacked == 0) { + cur_stream->rcvvar->sack_table[cur_stream->rcvvar->sacks].left_edge = left_edge; + cur_stream->rcvvar->sack_table[cur_stream->rcvvar->sacks].right_edge = right_edge; + cur_stream->rcvvar->sacks++; + newly_sacked = (right_edge - left_edge); + } + + //fprintf(stderr, "SACK (%u,%u)->%u/%u\n", left_edge, right_edge, newly_sacked, newly_sacked / 1448); + cur_stream->rcvvar->sacked_pkts += (newly_sacked / cur_stream->sndvar->mss); + + return; +} +/*----------------------------------------------------------------------------*/ +// static int GenerateSACKOption(tcp_stream *cur_stream, uint8_t *tcpopt) +// { +// (void)cur_stream; +// (void)tcpopt; +// // TODO +// return 0; +// } +/*----------------------------------------------------------------------------*/ +void ParseSACKOption(tcp_stream *cur_stream, uint32_t ack_seq, uint8_t *tcpopt, int len) +{ + ack_seq = ack_seq; // to avoid warning + int i, j; + int opt, optlen; + uint32_t left_edge, 
right_edge; + + for (i = 0; i < len;) { + opt = *(tcpopt + i++); + + if (opt == TCP_OPT_END) { // end of option field + break; + } else if (opt == TCP_OPT_NOP) { // no option + continue; + } else { + optlen = *(tcpopt + i++); + if (i + optlen - 2 > len) { + break; + } + + if (opt == TCP_OPT_SACK) { + j = 0; + while (j < optlen - 2) { + left_edge = ntohl(*(uint32_t *)(tcpopt + i + j)); + right_edge = ntohl(*(uint32_t *)(tcpopt + i + j + 4)); + + _update_sack_table(cur_stream, left_edge, right_edge); + + j += 8; +#if RTM_STAT + cur_stream->rstat->sack_cnt++; + cur_stream->rstat->sack_bytes += (right_edge - left_edge); +#endif + if (cur_stream->rcvvar->dup_acks == 3) { +#if RTM_STAT + cur_stream->rstat->tdp_sack_cnt++; + cur_stream->rstat->tdp_sack_bytes += (right_edge - left_edge); +#endif + TRACE_LOSS("SACK entry. " + "left_edge: %u, right_edge: %u (ack_seq: %u)\n", + left_edge, right_edge, ack_seq); + } + TRACE_SACK("Found SACK entry. " + "left_edge: %u, right_edge: %u\n", + left_edge, right_edge); + } + i += j; + } else { + // not handle + i += optlen - 2; + } + } + } +} +#endif /* TCP_OPT_SACK_ENABLED */ +/*---------------------------------------------------------------------------*/ +uint16_t TCPCalcChecksum(uint16_t *buf, uint16_t len, uint32_t saddr, uint32_t daddr) +{ + uint32_t sum; + uint16_t *w; + int nleft; + + sum = 0; + nleft = len; + w = buf; + + while (nleft > 1) { + sum += *w++; + nleft -= 2; + } + + // add padding for odd length + if (nleft) + sum += *w & ntohs(0xFF00); + + // add pseudo header + sum += (saddr & 0x0000FFFF) + (saddr >> 16); + sum += (daddr & 0x0000FFFF) + (daddr >> 16); + sum += htons(len); + sum += htons(IPPROTO_TCP); + + sum = (sum >> 16) + (sum & 0xFFFF); + sum += (sum >> 16); + + sum = ~sum; + + return (uint16_t)sum; +} +/*---------------------------------------------------------------------------*/ +void PrintTCPOptions(uint8_t *tcpopt, int len) +{ + int i; + unsigned int opt, optlen; + + for (i = 0; i < len; i++) { + 
printf("%u ", tcpopt[i]); + } + printf("\n"); + + for (i = 0; i < len;) { + opt = *(tcpopt + i++); + + if (opt == TCP_OPT_END) { // end of option field + break; + } else if (opt == TCP_OPT_NOP) { // no option + continue; + } else { + optlen = *(tcpopt + i++); + + printf("Option: %d", opt); + printf(", length: %d", optlen); + + if (opt == TCP_OPT_MSS) { + uint16_t mss; + mss = *(tcpopt + i++) << 8; + mss += *(tcpopt + i++); + printf(", MSS: %u", mss); + } else if (opt == TCP_OPT_SACK_PERMIT) { + printf(", SACK permit"); + } else if (opt == TCP_OPT_TIMESTAMP) { + uint32_t ts_val, ts_ref; + ts_val = *(uint32_t *)(tcpopt + i); + i += 4; + ts_ref = *(uint32_t *)(tcpopt + i); + i += 4; + printf(", TSval: %u, TSref: %u", ts_val, ts_ref); + } else if (opt == TCP_OPT_WSCALE) { + uint8_t wscale; + wscale = *(tcpopt + i++); + printf(", Wscale: %u", wscale); + } else { + // not handle + i += optlen - 2; + } + printf("\n"); + } + } +} diff --git a/lib/flash/mtcp/timer.c b/lib/flash/mtcp/timer.c new file mode 100644 index 0000000..c9445f1 --- /dev/null +++ b/lib/flash/mtcp/timer.c @@ -0,0 +1,515 @@ +/* + * mTCP source code is distributed under the Modified BSD Licence. + * + * Copyright (C) 2015 EunYoung Jeong, Shinae Woo, Muhammad Jamshed, Haewon Jeong, + * Sunghwan Ihm, Dongsu Han, KyoungSoo Park + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "timer.h" +#include "tcp_in.h" +#include "tcp_out.h" +#include "stat.h" +#include "debug.h" +#if USE_CCP +#include "ccp.h" +#endif + +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +/*----------------------------------------------------------------------------*/ +struct rto_hashstore *InitRTOHashstore(void) +{ + int i; + struct rto_hashstore *hs = calloc(1, sizeof(struct rto_hashstore)); + if (!hs) { + TRACE_ERROR("calloc: InitHashStore"); + return 0; + } + + for (i = 0; i < RTO_HASH; i++) + TAILQ_INIT(&hs->rto_list[i]); + + TAILQ_INIT(&hs->rto_list[RTO_HASH]); + + return hs; +} +/*----------------------------------------------------------------------------*/ +inline void AddtoRTOList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + if (!mtcp->rto_list_cnt) { + mtcp->rto_store->rto_now_idx = 0; + mtcp->rto_store->rto_now_ts = cur_stream->sndvar->ts_rto; + } + + if (cur_stream->on_rto_idx < 0) { + if (cur_stream->on_timewait_list) { + TRACE_ERROR("Stream %u: cannot be in both " + "rto and timewait list.\n", + cur_stream->id); +#ifdef DUMP_STREAM + DumpStream(mtcp, cur_stream); +#endif + return; + } + + int diff = (int32_t)(cur_stream->sndvar->ts_rto - mtcp->rto_store->rto_now_ts); + if (diff < RTO_HASH) { + int offset = (diff + mtcp->rto_store->rto_now_idx) % RTO_HASH; + cur_stream->on_rto_idx = offset; + TAILQ_INSERT_TAIL(&(mtcp->rto_store->rto_list[offset]), cur_stream, sndvar->timer_link); + } else { + cur_stream->on_rto_idx = RTO_HASH; + TAILQ_INSERT_TAIL(&(mtcp->rto_store->rto_list[RTO_HASH]), cur_stream, sndvar->timer_link); + } + mtcp->rto_list_cnt++; + } +} +/*----------------------------------------------------------------------------*/ +inline void RemoveFromRTOList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + if (cur_stream->on_rto_idx < 0) { + // assert(0); + return; + } + + TAILQ_REMOVE(&mtcp->rto_store->rto_list[cur_stream->on_rto_idx], cur_stream, sndvar->timer_link); + cur_stream->on_rto_idx = -1; + + mtcp->rto_list_cnt--; +} +/*----------------------------------------------------------------------------*/ +inline void AddtoTimewaitList(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts) +{ 
+ cur_stream->rcvvar->ts_tw_expire = cur_ts + CONFIG.tcp_timewait; + + if (cur_stream->on_timewait_list) { + // Update list in sorted way by ts_tw_expire + TAILQ_REMOVE(&mtcp->timewait_list, cur_stream, sndvar->timer_link); + TAILQ_INSERT_TAIL(&mtcp->timewait_list, cur_stream, sndvar->timer_link); + } else { + if (cur_stream->on_rto_idx >= 0) { + TRACE_DBG("Stream %u: cannot be in both " + "timewait and rto list.\n", + cur_stream->id); + //assert(0); +#ifdef DUMP_STREAM + DumpStream(mtcp, cur_stream); +#endif + RemoveFromRTOList(mtcp, cur_stream); + } + + cur_stream->on_timewait_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->timewait_list, cur_stream, sndvar->timer_link); + mtcp->timewait_list_cnt++; + } +} +/*----------------------------------------------------------------------------*/ +inline void RemoveFromTimewaitList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + if (!cur_stream->on_timewait_list) { + assert(0); + return; + } + + TAILQ_REMOVE(&mtcp->timewait_list, cur_stream, sndvar->timer_link); + cur_stream->on_timewait_list = FALSE; + mtcp->timewait_list_cnt--; +} +/*----------------------------------------------------------------------------*/ +inline void AddtoTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + if (cur_stream->on_timeout_list) { + assert(0); + return; + } + + cur_stream->on_timeout_list = TRUE; + TAILQ_INSERT_TAIL(&mtcp->timeout_list, cur_stream, sndvar->timeout_link); + mtcp->timeout_list_cnt++; +} +/*----------------------------------------------------------------------------*/ +inline void RemoveFromTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + if (cur_stream->on_timeout_list) { + cur_stream->on_timeout_list = FALSE; + TAILQ_REMOVE(&mtcp->timeout_list, cur_stream, sndvar->timeout_link); + mtcp->timeout_list_cnt--; + } +} +/*----------------------------------------------------------------------------*/ +inline void UpdateTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream) +{ + if (cur_stream->on_timeout_list) 
{ + TAILQ_REMOVE(&mtcp->timeout_list, cur_stream, sndvar->timeout_link); + TAILQ_INSERT_TAIL(&mtcp->timeout_list, cur_stream, sndvar->timeout_link); + } +} +/*----------------------------------------------------------------------------*/ +inline void UpdateRetransmissionTimer(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts) +{ + /* Update the retransmission timer */ + assert(cur_stream->sndvar->rto > 0); + cur_stream->sndvar->nrtx = 0; + + /* if in rto list, remove it */ + if (cur_stream->on_rto_idx >= 0) { + RemoveFromRTOList(mtcp, cur_stream); + } + + /* Reset retransmission timeout */ + if (TCP_SEQ_GT(cur_stream->snd_nxt, cur_stream->sndvar->snd_una)) { + /* there are packets sent but not acked */ + /* update rto timestamp */ + cur_stream->sndvar->ts_rto = cur_ts + cur_stream->sndvar->rto; + AddtoRTOList(mtcp, cur_stream); + + } else { + /* all packets are acked */ + TRACE_RTO("All packets are acked. snd_una: %u, snd_nxt: %u\n", cur_stream->sndvar->snd_una, cur_stream->snd_nxt); + } +} +/*----------------------------------------------------------------------------*/ +static int HandleRTO(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream) +{ + uint8_t backoff; + + TRACE_RTO("Stream %d Timeout! 
rto: %u (%ums), snd_una: %u, snd_nxt: %u\n", cur_stream->id, cur_stream->sndvar->rto, + TS_TO_MSEC(cur_stream->sndvar->rto), cur_stream->sndvar->snd_una, cur_stream->snd_nxt); + assert(cur_stream->sndvar->rto > 0); + + /* if the stream is ready to be closed, don't handle RTO */ + if (cur_stream->close_reason != TCP_NOT_CLOSED) + return 0; + +#if USE_CCP + ccp_record_event(mtcp, cur_stream, EVENT_TIMEOUT, 0); +#endif + + /* count number of retransmissions */ + if (cur_stream->sndvar->nrtx < TCP_MAX_RTX) { + cur_stream->sndvar->nrtx++; + } else { + /* if it exceeds the threshold, destroy and notify to application */ + TRACE_RTO("Stream %d: Exceed MAX_RTX\n", cur_stream->id); + if (cur_stream->state < TCP_ST_ESTABLISHED) { + cur_stream->state = TCP_ST_CLOSED; + cur_stream->close_reason = TCP_CONN_FAIL; + DestroyTCPStream(mtcp, cur_stream); + } else { + cur_stream->state = TCP_ST_CLOSED; + cur_stream->close_reason = TCP_CONN_LOST; + if (cur_stream->socket) { + RaiseErrorEvent(mtcp, cur_stream); + } else { + DestroyTCPStream(mtcp, cur_stream); + } + } + + return ERROR; + } + if (cur_stream->sndvar->nrtx > cur_stream->sndvar->max_nrtx) { + cur_stream->sndvar->max_nrtx = cur_stream->sndvar->nrtx; + } + + /* update rto timestamp */ + if (cur_stream->state >= TCP_ST_ESTABLISHED) { + uint32_t rto_prev; + backoff = MIN(cur_stream->sndvar->nrtx, TCP_MAX_BACKOFF); + + rto_prev = cur_stream->sndvar->rto; + cur_stream->sndvar->rto = ((cur_stream->rcvvar->srtt >> 3) + cur_stream->rcvvar->rttvar) << backoff; + if (cur_stream->sndvar->rto <= 0) { + TRACE_RTO("Stream %d current rto: %u, prev: %u, state: %s\n", cur_stream->id, cur_stream->sndvar->rto, + rto_prev, TCPStateToString(cur_stream)); + cur_stream->sndvar->rto = rto_prev; + } + } else if (cur_stream->state >= TCP_ST_SYN_SENT) { + /* if there is no rtt measured, update rto based on the previous one */ + if (cur_stream->sndvar->nrtx < TCP_MAX_BACKOFF) { + cur_stream->sndvar->rto <<= 1; + } + } + //cur_stream->sndvar->ts_rto = 
cur_ts + cur_stream->sndvar->rto; + + /* reduce congestion window and ssthresh */ + cur_stream->sndvar->ssthresh = MIN(cur_stream->sndvar->cwnd, cur_stream->sndvar->peer_wnd) / 2; + if (cur_stream->sndvar->ssthresh < (2 * cur_stream->sndvar->mss)) { + cur_stream->sndvar->ssthresh = cur_stream->sndvar->mss * 2; + } + cur_stream->sndvar->cwnd = cur_stream->sndvar->mss; + TRACE_CONG("Stream %d Timeout. cwnd: %u, ssthresh: %u\n", cur_stream->id, cur_stream->sndvar->cwnd, + cur_stream->sndvar->ssthresh); + +#if RTM_STAT + /* update retransmission stats */ + cur_stream->sndvar->rstat.rto_cnt++; + cur_stream->sndvar->rstat.rto_bytes += (cur_stream->snd_nxt - cur_stream->sndvar->snd_una); +#endif + + /* Retransmission */ + if (cur_stream->state == TCP_ST_SYN_SENT) { + /* SYN lost */ + if (cur_stream->sndvar->nrtx > TCP_MAX_SYN_RETRY) { + cur_stream->state = TCP_ST_CLOSED; + cur_stream->close_reason = TCP_CONN_FAIL; + TRACE_RTO("Stream %d: SYN retries exceed maximum retries.\n", cur_stream->id); + if (cur_stream->socket) { + RaiseErrorEvent(mtcp, cur_stream); + } else { + DestroyTCPStream(mtcp, cur_stream); + } + + return ERROR; + } + TRACE_RTO("Stream %d Retransmit SYN. snd_nxt: %u, snd_una: %u\n", cur_stream->id, cur_stream->snd_nxt, + cur_stream->sndvar->snd_una); + + } else if (cur_stream->state == TCP_ST_SYN_RCVD) { + /* SYN/ACK lost */ + TRACE_RTO("Stream %d: Retransmit SYN/ACK. snd_nxt: %u, snd_una: %u\n", cur_stream->id, cur_stream->snd_nxt, + cur_stream->sndvar->snd_una); + + } else if (cur_stream->state == TCP_ST_ESTABLISHED) { + /* Data lost */ + TRACE_RTO("Stream %d: Retransmit data. snd_nxt: %u, snd_una: %u\n", cur_stream->id, cur_stream->snd_nxt, + cur_stream->sndvar->snd_una); + + } else if (cur_stream->state == TCP_ST_CLOSE_WAIT) { + /* Data lost */ + TRACE_RTO("Stream %d: Retransmit data. 
snd_nxt: %u, snd_una: %u\n", cur_stream->id, cur_stream->snd_nxt, + cur_stream->sndvar->snd_una); + + } else if (cur_stream->state == TCP_ST_LAST_ACK) { + /* FIN/ACK lost */ + TRACE_RTO("Stream %d: Retransmit FIN/ACK. " + "snd_nxt: %u, snd_una: %u\n", + cur_stream->id, cur_stream->snd_nxt, cur_stream->sndvar->snd_una); + + } else if (cur_stream->state == TCP_ST_FIN_WAIT_1) { + /* FIN lost */ + TRACE_RTO("Stream %d: Retransmit FIN. snd_nxt: %u, snd_una: %u\n", cur_stream->id, cur_stream->snd_nxt, + cur_stream->sndvar->snd_una); + } else if (cur_stream->state == TCP_ST_CLOSING) { + TRACE_RTO("Stream %d: Retransmit ACK. snd_nxt: %u, snd_una: %u\n", cur_stream->id, cur_stream->snd_nxt, + cur_stream->sndvar->snd_una); + //TRACE_DBG("Stream %d: Retransmitting at CLOSING\n", cur_stream->id); + + } else { + TRACE_ERROR("Stream %d: not implemented state! state: %s, rto: %u\n", cur_stream->id, TCPStateToString(cur_stream), + cur_stream->sndvar->rto); + assert(0); + return ERROR; + } + + if (cur_stream->have_reset && cur_stream->state == TCP_ST_SYN_RCVD) { + DestroyTCPStream(mtcp, cur_stream); + return 0; + } + + cur_stream->snd_nxt = cur_stream->sndvar->snd_una; + if (cur_stream->state == TCP_ST_ESTABLISHED || cur_stream->state == TCP_ST_CLOSE_WAIT) { + /* retransmit data at ESTABLISHED state */ + AddtoSendList(mtcp, cur_stream); + + } else if (cur_stream->state == TCP_ST_FIN_WAIT_1 || cur_stream->state == TCP_ST_CLOSING || + cur_stream->state == TCP_ST_LAST_ACK) { + if (cur_stream->sndvar->fss == 0) { + TRACE_ERROR("Stream %u: fss not set.\n", cur_stream->id); + } + /* decide to retransmit data or control packet */ + if (TCP_SEQ_LT(cur_stream->snd_nxt, cur_stream->sndvar->fss)) { + /* need to retransmit data */ + if (cur_stream->sndvar->on_control_list) { + RemoveFromControlList(mtcp, cur_stream); + } + cur_stream->control_list_waiting = TRUE; + AddtoSendList(mtcp, cur_stream); + + } else { + /* need to retransmit control packet */ + AddtoControlList(mtcp, cur_stream, 
cur_ts); + } + + } else { + AddtoControlList(mtcp, cur_stream, cur_ts); + } + + return 0; +} +/*----------------------------------------------------------------------------*/ +static inline void RearrangeRTOStore(mtcp_manager_t mtcp) +{ + tcp_stream *walk, *next; + struct rto_head *rto_list = &mtcp->rto_store->rto_list[RTO_HASH]; + int cnt = 0; + + for (walk = TAILQ_FIRST(rto_list); walk != NULL; walk = next) { + next = TAILQ_NEXT(walk, sndvar->timer_link); + + int diff = (int32_t)(mtcp->rto_store->rto_now_ts - walk->sndvar->ts_rto); + if (diff < RTO_HASH) { + int offset = (diff + mtcp->rto_store->rto_now_idx) % RTO_HASH; + TAILQ_REMOVE(&mtcp->rto_store->rto_list[RTO_HASH], walk, sndvar->timer_link); + walk->on_rto_idx = offset; + TAILQ_INSERT_TAIL(&(mtcp->rto_store->rto_list[offset]), walk, sndvar->timer_link); + } + cnt++; + } +} +/*----------------------------------------------------------------------------*/ +void CheckRtmTimeout(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh) +{ + tcp_stream *walk, *next; + struct rto_head *rto_list; + int cnt; + + if (!mtcp->rto_list_cnt) { + return; + } + + STAT_COUNT(mtcp->runstat.rounds_rtocheck); + + cnt = 0; + + while (1) { + rto_list = &mtcp->rto_store->rto_list[mtcp->rto_store->rto_now_idx]; + if ((int32_t)(cur_ts - mtcp->rto_store->rto_now_ts) < 0) { + break; + } + + for (walk = TAILQ_FIRST(rto_list); walk != NULL; walk = next) { + if (++cnt > thresh) { + break; + } + next = TAILQ_NEXT(walk, sndvar->timer_link); + + TRACE_LOOP("Inside rto list. 
cnt: %u, stream: %d\n", cnt, walk->s_id); + + if (walk->on_rto_idx >= 0) { + TAILQ_REMOVE(rto_list, walk, sndvar->timer_link); + mtcp->rto_list_cnt--; + walk->on_rto_idx = -1; + HandleRTO(mtcp, cur_ts, walk); + } else { + TRACE_ERROR("Stream %d: not on rto list.\n", walk->id); +#ifdef DUMP_STREAM + DumpStream(mtcp, walk); +#endif + } + } + + if (cnt > thresh) { + break; + } else { + mtcp->rto_store->rto_now_idx = (mtcp->rto_store->rto_now_idx + 1) % RTO_HASH; + mtcp->rto_store->rto_now_ts++; + if (!(mtcp->rto_store->rto_now_idx % 1000)) { + RearrangeRTOStore(mtcp); + } + } + } + + TRACE_ROUND("Checking retransmission timeout. cnt: %d\n", cnt); +} +/*----------------------------------------------------------------------------*/ +void CheckTimewaitExpire(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh) +{ + tcp_stream *walk, *next; + int cnt; + + STAT_COUNT(mtcp->runstat.rounds_twcheck); + + cnt = 0; + + for (walk = TAILQ_FIRST(&mtcp->timewait_list); walk != NULL; walk = next) { + if (++cnt > thresh) + break; + next = TAILQ_NEXT(walk, sndvar->timer_link); + + TRACE_LOOP("Inside timewait list. cnt: %u, stream: %d\n", cnt, walk->s_id); + + if (walk->on_timewait_list) { + if ((int32_t)(cur_ts - walk->rcvvar->ts_tw_expire) >= 0) { + if (!walk->sndvar->on_control_list) { + TAILQ_REMOVE(&mtcp->timewait_list, walk, sndvar->timer_link); + walk->on_timewait_list = FALSE; + mtcp->timewait_list_cnt--; + + walk->state = TCP_ST_CLOSED; + walk->close_reason = TCP_ACTIVE_CLOSE; + TRACE_STATE("Stream %d: TCP_ST_CLOSED\n", walk->id); + DestroyTCPStream(mtcp, walk); + } + } else { + break; + } + } else { + TRACE_ERROR("Stream %d: not on timewait list.\n", walk->id); +#ifdef DUMP_STREAM + DumpStream(mtcp, walk); +#endif + } + } + + TRACE_ROUND("Checking timewait timeout. 
cnt: %d\n", cnt); +} +/*----------------------------------------------------------------------------*/ +void CheckConnectionTimeout(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh) +{ + tcp_stream *walk, *next; + int cnt; + + STAT_COUNT(mtcp->runstat.rounds_tocheck); + + cnt = 0; + for (walk = TAILQ_FIRST(&mtcp->timeout_list); walk != NULL; walk = next) { + if (++cnt > thresh) + break; + next = TAILQ_NEXT(walk, sndvar->timeout_link); + + if ((int32_t)(cur_ts - walk->last_active_ts) >= CONFIG.tcp_timeout) { + walk->on_timeout_list = FALSE; + TAILQ_REMOVE(&mtcp->timeout_list, walk, sndvar->timeout_link); + mtcp->timeout_list_cnt--; + walk->state = TCP_ST_CLOSED; + walk->close_reason = TCP_TIMEDOUT; + if (walk->socket) { + RaiseErrorEvent(mtcp, walk); + } else { + DestroyTCPStream(mtcp, walk); + } + } else { + break; + } + } +} +/*----------------------------------------------------------------------------*/ diff --git a/meson.build b/meson.build index 7c79d63..f8edace 100644 --- a/meson.build +++ b/meson.build @@ -139,6 +139,27 @@ foreach arg : warning_flags endif endforeach +# Enable mtcp if the option is set +if get_option('enable_mtcp') + mtcp_flag = [ + '-DDISABLE_ONVM', + '-DDISABLE_PSIO', + '-DDISABLE_DPDK', + '-DDISABLE_NETMAP', + '-fgnu89-inline', + '-DNDEBUG', + '-DNETSTAT', + '-DINFO', + '-DDBGERR', + '-DDBGCERR', + '-D__USRLIB__', + ] + + foreach arg : mtcp_flag + add_project_arguments(arg, language: 'c') + endforeach +endif + # set loggging options for log library if get_option('log_use_color') add_project_arguments('-DLOG_USE_COLOR', language: 'c') diff --git a/meson_options.txt b/meson_options.txt index 032ea14..2030945 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -6,3 +6,6 @@ option('log_use_color', type: 'boolean', value: true, option('enable_rust', type: 'boolean', value: true, description: 'Enable building Rust applications and libraries') + +option('enable_mtcp', type: 'boolean', value: true, + description: 'Enable building mtcp 
libraries and examples') From bac55eb81699f2ce6d2774e1c3f1957d5f66f0ae Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Wed, 10 Sep 2025 17:29:45 +0530 Subject: [PATCH 22/43] feat: added a new helper for hexdump --- lib/flash/nf/flash_helpers.c | 37 ++++++++++++++++++++++++++++++++++++ lib/flash/nf/flash_nf.h | 11 +++++++++++ lib/include/flash_list.h | 1 + 3 files changed, 49 insertions(+) diff --git a/lib/flash/nf/flash_helpers.c b/lib/flash/nf/flash_helpers.c index 97901ec..9ec2b9c 100644 --- a/lib/flash/nf/flash_helpers.c +++ b/lib/flash/nf/flash_helpers.c @@ -41,4 +41,41 @@ int flash__get_macaddr(struct config *cfg, struct ether_addr *addr) close(fd); return 0; +} + +void flash__hex_dump(void *pkt, size_t length, bool verbose) +{ + const unsigned char *address = (unsigned char *)pkt; + const unsigned char *line = address; + size_t line_size = 32; + unsigned char c; + char buf[32]; + int i = 0; + + if (verbose) { + printf("length = %zu\n", length); + printf("%s | ", buf); + } + + while (length-- > 0) { + printf("%02X ", *address++); + if (!(++i % line_size) || (length == 0 && i % line_size)) { + if (length == 0) { + while (i++ % line_size) + printf("__ "); + } + + if (verbose) { + printf(" | "); /* right close */ + while (line < address) { + c = *line++; + printf("%c", (c < 33 || c == 255) ? 0x2E : c); + } + } + printf("\n"); + if (length > 0 && verbose) + printf("%s | ", buf); + } + } + printf("\n"); } \ No newline at end of file diff --git a/lib/flash/nf/flash_nf.h b/lib/flash/nf/flash_nf.h index bf60a48..8f1cee5 100644 --- a/lib/flash/nf/flash_nf.h +++ b/lib/flash/nf/flash_nf.h @@ -155,6 +155,17 @@ void *flash__stats_thread(void *conf); */ int flash__get_macaddr(struct config *cfg, struct ether_addr *addr); +/** + * Dump the contents of a packet in hexadecimal format. + * + * @param pkt: Pointer to the packet data. + * @param length: Length of the packet data. + * @param verbose: If true, print additional information. 
+ * + * This function prints the packet data in a human-readable hexadecimal format. + */ +void flash__hex_dump(void *pkt, size_t length, bool verbose); + /* Advanced APIs */ void flash__populate_fill_ring(struct thread **thread, int frame_size, int total_sockets, int umem_offset, int umem_scale); diff --git a/lib/include/flash_list.h b/lib/include/flash_list.h index c7c1df7..d650455 100644 --- a/lib/include/flash_list.h +++ b/lib/include/flash_list.h @@ -60,6 +60,7 @@ static inline int list_empty(const struct list_head *head) return head->next == head; } +#define container_of(ptr, type, member) ((type *)((char *)(ptr) - offsetof(type, member))) #define list_entry(ptr, type, member) container_of(ptr, type, member) #define list_first_entry(ptr, type, member) list_entry((ptr)->next, type, member) #define list_next_entry(pos, member) list_entry((pos)->member.next, typeof(*(pos)), member) From 04ae370d4e1fd4d42c772a6341158556d39bdc03 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Mon, 15 Sep 2025 13:43:27 +0530 Subject: [PATCH 23/43] fix: updated maglev, mica, simple-firewall, unit-tests/backpressure to the new library API. 
--- examples/maglev/main.c | 410 ++++++++++++++++++--------- examples/mica/main.c | 346 ++++++++++++++-------- examples/simple-firewall/config.json | 6 +- examples/simple-firewall/main.c | 308 ++++++++++++-------- examples/unit-tests/backpressure.c | 386 +++++++++++++++---------- examples/unit-tests/correctness.c | 11 +- 6 files changed, 940 insertions(+), 527 deletions(-) diff --git a/examples/maglev/main.c b/examples/maglev/main.c index 421ff11..1a482f4 100644 --- a/examples/maglev/main.c +++ b/examples/maglev/main.c @@ -19,7 +19,7 @@ #define PROTO_STRLEN 4 -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf; @@ -41,15 +41,24 @@ struct appconf { int cpu_start; int cpu_end; int stats_cpu; + int srv_port; + int bkd_port; + uint8_t mac_addr[6]; } app_conf; -struct Args { - int socket_id; - int *next; - int next_size; +// clang-format off +static const char *maglev_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-S \tSet MAC address (default: 11:22:33:44:55:66)", + "-p \tService port (default: 80)", + "-P \tBackend port (default: 80)", + NULL }; +// clang-format on -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; @@ -58,12 +67,27 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->cpu_start = 0; app_conf->cpu_end = 0; app_conf->stats_cpu = 1; + app_conf->srv_port = 80; + app_conf->bkd_port = 80; + + int ethaddr[6]; + ethaddr[0] = 0x11; + ethaddr[1] = 0x22; + ethaddr[2] = 0x33; + ethaddr[3] = 0x44; + ethaddr[4] = 0x55; + ethaddr[5] = 0x66; + for (int i = 0; i < 6; i++) + app_conf->mac_addr[i] = (uint8_t)ethaddr[i]; argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "c:e:s:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:S:p:P:")) != -1) switch (c) { + case 'h': + 
printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -73,32 +97,26 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int case 's': app_conf->stats_cpu = atoi(optarg); break; - default: - abort(); - } -} - -static void *worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); + case 'S': + if (sscanf(optarg, "%x:%x:%x:%x:%x:%x", ðaddr[0], ðaddr[1], ðaddr[2], ðaddr[3], ðaddr[4], + ðaddr[5]) != 6) { + log_error("Invalid MAC address format: %s", optarg); + return -1; } + for (int i = 0; i < 6; i++) + app_conf->mac_addr[i] = (uint8_t)ethaddr[i]; + break; + case 'p': + app_conf->srv_port = atoi(optarg); + break; + case 'P': + app_conf->bkd_port = atoi(optarg); + break; + default: + printf("Usage: %s -h\n", argv[-shift]); + return -1; } - } - return NULL; + return 0; } static void configure(struct maglev *mag, int num_bkds) @@ -183,23 +201,49 @@ struct backend_entry { struct backend_info value; }; -static void load_services(void) +static int load_services(void) { - flash__send_cmd(cfg->uds_sockfd, FLASH__GET_IP_ADDR); - flash__recv_data(cfg->uds_sockfd, srv_addr, INET_ADDRSTRLEN); + int ret; + ret = flash__send_cmd(cfg->uds_sockfd, FLASH__GET_IP_ADDR); + if (ret < 0) { + log_error("Failed to send command to get NF IP address"); + return -1; + } + ret = flash__recv_data(cfg->uds_sockfd, srv_addr, INET_ADDRSTRLEN); + if (ret < 0) { + log_error("Failed to receive NF IP address"); + return -1; + } log_info("NF IP: %s", srv_addr); - flash__send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); - 
flash__recv_data(cfg->uds_sockfd, &nbackends, sizeof(int)); + ret = flash__send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); + if (ret < 0) { + log_error("Failed to send command to get Backend IP addresses"); + return -1; + } + ret = flash__recv_data(cfg->uds_sockfd, &nbackends, sizeof(int)); + if (ret < 0) { + log_error("Failed to receive number of backends"); + return -1; + } log_info("Number of Backends: %d", nbackends); + if (nbackends <= 0 || nbackends > MAX_BACKENDS) { + log_error("Invalid number of backends: %d", nbackends); + return -1; + } for (int i = 0; i < nbackends; i++) { - flash__recv_data(cfg->uds_sockfd, bkd_addr[i], INET_ADDRSTRLEN); + log_info("Receiving Backend %d IP address", i); + ret = flash__recv_data(cfg->uds_sockfd, bkd_addr[i], INET_ADDRSTRLEN); + if (ret < 0) { + log_error("Failed to receive Backend IP address %d", i); + return -1; + } log_info("Backend %d IP: %s", i, bkd_addr[i]); } char proto[PROTO_STRLEN]; unsigned srv_port, bkd_port; - uint8_t mac_addr[6]; + uint8_t *mac_addr; struct service_info *srv_info; struct backend_entry *bkd_entry; struct in_addr addr; @@ -209,25 +253,40 @@ static void load_services(void) struct backend_entry *backend_entries; struct hashmap srv_to_index; - hashmap_init(&services, sizeof(struct service_id), sizeof(struct service_info), MAX_SERVICES); - hashmap_init(&backends, sizeof(struct backend_id), sizeof(struct backend_info), MAX_BACKENDS); - hashmap_init(&maglev_tables, sizeof(struct service_id), sizeof(struct maglev), MAX_SERVICES); + if (hashmap_init(&services, sizeof(struct service_id), sizeof(struct service_info), MAX_SERVICES) != 1) { + log_error("ERROR: unable to initialize services hashmap"); + return -1; + } + if (hashmap_init(&backends, sizeof(struct backend_id), sizeof(struct backend_info), MAX_BACKENDS) != 1) { + log_error("ERROR: unable to initialize backends hashmap"); + goto out_1; + } + if (hashmap_init(&maglev_tables, sizeof(struct service_id), sizeof(struct maglev), MAX_SERVICES) != 1) 
{ + log_error("ERROR: unable to initialize maglev tables hashmap"); + goto out_2; + } service_entries = malloc(sizeof(struct service_entry) * nservices); + if (!service_entries) { + log_error("ERROR: unable to allocate memory for service entries"); + goto out_3; + } backend_entries = malloc(sizeof(struct backend_entry) * nbackends); - hashmap_init(&srv_to_index, sizeof(struct service_id), sizeof(int), nservices); - - mac_addr[0] = 0x11; - mac_addr[1] = 0x22; - mac_addr[2] = 0x33; - mac_addr[3] = 0x44; - mac_addr[4] = 0x55; - mac_addr[5] = 0x66; + if (!backend_entries) { + log_error("ERROR: unable to allocate memory for backend entries"); + goto out_4; + } + if (hashmap_init(&srv_to_index, sizeof(struct service_id), sizeof(int), nservices) != 1) { + log_error("ERROR: unable to initialize service to index hashmap"); + goto out_5; + } + + mac_addr = app_conf.mac_addr; // Manually add services and backends // Service 1: UDP from 192.168.1.1:80 to backend 192.168.1.2:8080 // strcpy(srv_addr, "192.168.1.1"); Stored from main fn itself - srv_port = 80; - bkd_port = 8080; + srv_port = app_conf.srv_port; + bkd_port = app_conf.bkd_port; strcpy(proto, "UDP"); for (int index = 0; index < nbackends; index++) { bkd_entry = &backend_entries[index]; @@ -239,7 +298,7 @@ static void load_services(void) inet_aton(bkd_addr[index], &addr); bkd_entry->value.addr = addr.s_addr; bkd_entry->value.port = htons(bkd_port); - __builtin_memcpy(&bkd_entry->value.mac_addr, mac_addr, sizeof(mac_addr)); + __builtin_memcpy(&bkd_entry->value.mac_addr, mac_addr, sizeof(app_conf.mac_addr)); srvindex = hashmap_lookup_elem(&srv_to_index, &bkd_entry->key.service); if (!srvindex) { @@ -249,8 +308,8 @@ static void load_services(void) srv_info = &srv_entry->value; if (hashmap_insert_elem(&srv_to_index, &srv_entry->key, &service_first_free) != 1) { - fprintf(stderr, "ERROR: unable to add service index to hash map\n"); - exit(EXIT_FAILURE); + log_error("ERROR: unable to add service to service to index 
hashmap"); + goto out_6; } service_first_free++; @@ -263,17 +322,18 @@ static void load_services(void) } for (int i = 0; i < nservices; i++) { - // printf("%u, %u\n", service_entries[i].key.vaddr, (__u32)(service_entries[i].key.vport)); + log_info("Adding service %u:%u proto %u with %u backends", ntohl(service_entries[i].key.vaddr), + ntohs(service_entries[i].key.vport), service_entries[i].key.proto, service_entries[i].value.backends); if (hashmap_insert_elem(&services, &service_entries[i].key, &service_entries[i].value) != 1) { - fprintf(stderr, "ERROR: unable to add service to hash map\n"); - exit(EXIT_FAILURE); + log_error("ERROR: unable to add service to hashmap"); + goto out_6; } } for (int i = 0; i < nbackends; i++) { if (hashmap_insert_elem(&backends, &backend_entries[i].key, &backend_entries[i].value) != 1) { - fprintf(stderr, "ERROR: unable to add backend to hash map\n"); - exit(EXIT_FAILURE); + log_error("ERROR: unable to add backend to hashmap\n"); + goto out_6; } } @@ -283,69 +343,115 @@ static void load_services(void) uint32_t num_bkds = service_entries[i].value.backends; configure(lookup, num_bkds); if (hashmap_insert_elem(&maglev_tables, &service_entries[i].key, lookup) != 1) { - fprintf(stderr, "ERROR: unable to add maglev table to hash map\n"); - exit(EXIT_FAILURE); + log_error("ERROR: unable to add maglev table to hashmap\n"); + goto out_6; } } - printf("Added %u services and %u backends\n", nservices, nbackends); + log_info("Added %d services and %d backends", nservices, nbackends); free(service_entries); free(backend_entries); hashmap_free(&srv_to_index); - return; + return 0; + +out_6: + hashmap_free(&srv_to_index); +out_5: + free(backend_entries); +out_4: + free(service_entries); +out_3: + hashmap_free(&maglev_tables); +out_2: + hashmap_free(&backends); +out_1: + hashmap_free(&services); + return -1; } +struct sock_args { + int socket_id; + int next_size; +}; + static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - 
int socket_id = a->socket_id; - int i, ret, nfds = 1, nrecv; + int ret; + nfds_t nfds = 1; + struct socket *xsk; + struct xskvec *xskvecs, *dropvecs, *sendvecs; struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + uint32_t i, nrecv, nsend, ndrop, wsend, wdrop; + struct sock_args *a = (struct sock_args *)arg; - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + log_debug("Socket ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; + + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("Failed to allocate xskvecs array"); + return NULL; + } + dropvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!dropvecs) { + log_error("Failed to allocate dropvecs array"); + free(xskvecs); + return NULL; + } + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!sendvecs) { + log_error("Failed to allocate sendvecs array"); + free(xskvecs); + free(dropvecs); + return NULL; + } - fds[0].fd = nf->thread[socket_id]->socket->fd; + fds[0].fd = xsk->fd; fds[0].events = POLLIN; struct hashmap active_sessions; - hashmap_init(&active_sessions, sizeof(struct session_id), sizeof(struct replace_info), MAX_SESSIONS); + ret = hashmap_init(&active_sessions, sizeof(struct session_id), sizeof(struct replace_info), MAX_SESSIONS); + if (ret != 1) { + log_error("ERROR: unable to initialize active sessions hashmap"); + free(xskvecs); + free(dropvecs); + free(sendvecs); + return NULL; + } for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); - struct xskvec *drop[nrecv]; - unsigned int tot_pkt_drop = 0; - struct xskvec *send[nrecv]; - unsigned int 
tot_pkt_send = 0; + wdrop = 0; + wsend = 0; for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; + struct xskvec *xv = &xskvecs[i]; + void *pkt = xv->data; void *pkt_end = pkt + xv->len; + struct ethhdr *eth = pkt; if ((void *)(eth + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - log_info("INVALID PACKET Dropping packet: %d", tot_pkt_drop); + dropvecs[wdrop++] = xskvecs[i]; + log_error("ERROR: invalid Ethernet frame"); continue; } if (eth->h_proto != htons(ETH_P_IP)) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - log_info("INVALID ETH PROTO Dropping packet: %d", tot_pkt_drop); + dropvecs[wdrop++] = xskvecs[i]; + log_error("ERROR: not an IP packet"); continue; } struct iphdr *iph = (void *)(eth + 1); if ((void *)(iph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - log_info("INVALID IPHDR Dropping packet: %d", tot_pkt_drop); + dropvecs[wdrop++] = xskvecs[i]; + log_error("ERROR: invalid IP header"); continue; } @@ -357,8 +463,8 @@ static void *socket_routine(void *arg) case IPPROTO_TCP:; struct tcphdr *tcph = next; if ((void *)(tcph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - log_info("INVALID TCPHDR Dropping packet: %d", tot_pkt_drop); + log_error("ERROR: invalid TCP header"); + dropvecs[wdrop++] = xskvecs[i]; continue; } @@ -371,8 +477,8 @@ static void *socket_routine(void *arg) case IPPROTO_UDP:; struct udphdr *udph = next; if ((void *)(udph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - log_info("INVALID UDPHDR Dropping packet: %d", tot_pkt_drop); + dropvecs[wdrop++] = xskvecs[i]; + log_error("ERROR: invalid UDP header"); continue; } @@ -383,8 +489,8 @@ static void *socket_routine(void *arg) break; default: - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - log_info("DEFAULT Dropping packet: %d", tot_pkt_drop); + dropvecs[wdrop++] = xskvecs[i]; + log_error("ERROR: not a TCP/UDP packet"); continue; } @@ -407,11 +513,11 @@ static void *socket_routine(void *arg) /* New session, apply load balancing 
logic */ struct service_id srvid = { .vaddr = iph->daddr, .vport = *dport, .proto = iph->protocol }; - printf("%u, %u, %u\n", srvid.vaddr, (__u32)(srvid.vport), (__u32)(srvid.proto)); struct service_info *srvinfo = hashmap_lookup_elem(&services, &srvid); if (!srvinfo) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - log_info("ERROR: missing service --> DROPPING\n"); + dropvecs[wdrop++] = xskvecs[i]; + log_error("ERROR: service not found for %u:%u proto %u --> DROPPING", ntohl(srvid.vaddr), + ntohs(srvid.vport), srvid.proto); continue; } @@ -422,8 +528,9 @@ static void *socket_routine(void *arg) }; struct backend_info *bkdinfo = hashmap_lookup_elem(&backends, &bkdid); if (!bkdinfo) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; - log_info("ERROR: missing backend --> DROPPING\n"); + dropvecs[wdrop++] = xskvecs[i]; + log_error("ERROR: backend not found for service %u:%u proto %u and index %u --> DROPPING", + ntohl(srvid.vaddr), ntohs(srvid.vport), srvid.proto, bkdid.index); continue; } @@ -436,7 +543,7 @@ static void *socket_routine(void *arg) __builtin_memcpy(fwd_rep.mac_addr, &bkdinfo->mac_addr, sizeof(fwd_rep.mac_addr)); rep = &fwd_rep; if (hashmap_insert_elem(&active_sessions, &sid, &fwd_rep) != 1) { - fprintf(stderr, "ERROR: unable to add forward session to map\n"); + log_error("ERROR: unable to add forward session to map\n"); goto insert; } @@ -451,7 +558,7 @@ static void *socket_routine(void *arg) sid.saddr = bkdinfo->addr; sid.sport = bkdinfo->port; if (hashmap_insert_elem(&active_sessions, &sid, &bwd_rep) != 1) { - fprintf(stderr, "ERROR: unable to add backward session to map\n"); + log_error("ERROR: unable to add backward session to map\n"); goto insert; } @@ -487,44 +594,66 @@ static void *socket_routine(void *arg) *l4check = csum_fold(csum); xv->options = (rep->bkdindex << 16) | (xv->options & 0xFFFF); - send[tot_pkt_send++] = &msg.msg_iov[i]; + sendvecs[wsend++] = xskvecs[i]; } if (nrecv) { - size_t ret_send = flash__oldsendmsg(cfg, 
nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); - - if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { + ndrop = flash__dropmsg(cfg, xsk, dropvecs, wdrop); + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + if (ndrop != wdrop || nsend != wsend) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + break; } } if (done) break; } - free(msg.msg_iov); + free(xskvecs); + free(dropvecs); + free(sendvecs); hashmap_free(&active_sessions); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { log_error("ERROR: Memory allocation failed\n"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "maglev"; + cfg->app_options = maglev_options; + cfg->done = &done; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) { + log_error("ERROR: Failed to parse command line arguments"); + goto out_cfg; + } + if (parse_app_args(argc, argv, &app_conf, shift) < 0) { + log_error("ERROR: Failed to parse application arguments"); + goto out_cfg; + } + if (flash__configure_nf(&nf, cfg) < 0) { + log_error("ERROR: Failed to configure NF"); + goto out_cfg; + } log_info("Control Plane Setup Done"); - load_services(); + if (load_services() < 0) { + log_error("ERROR: Failed to load services"); + goto out_cfg_close; + } signal(SIGINT, int_exit); signal(SIGTERM, int_exit); @@ -532,50 +661,65 @@ int main(int argc, char **argv) log_info("STARTING Data Path"); + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if 
(!args) { + log_error("ERROR: Memory allocation failed for socket args"); + goto out_cfg_close; + } for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; - - log_info("2_NEXT_SIZE: %d", args->next_size); + args[i].socket_id = i; + args[i].next_size = nf->next_size; - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + log_info("2_NEXT_SIZE: %d", args[i].next_size); - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s\n", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(stats_thread); + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s\n", strerror(errno)); + goto out_args; + } 
flash__wait(cfg); - flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + done = true; + free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } \ No newline at end of file diff --git a/examples/mica/main.c b/examples/mica/main.c index e8f2f1f..a44a660 100644 --- a/examples/mica/main.c +++ b/examples/mica/main.c @@ -5,42 +5,34 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include #include -#include #include "./ported-mica/hash.h" #include "./ported-mica/mehcached.h" -#define IP_STRLEN 16 -#define PROTO_STRLEN 4 -#define IFNAME_STRLEN 256 -#define MAX_VALID_SESSIONS 100 - ////// MICA PART /////// #define NUM_KEYS 2000 #define VALUE_SIZE 256 size_t default_keys[NUM_KEYS]; -int keys_index = 0; +int keys_index = NUM_KEYS - 1; char default_value[VALUE_SIZE]; struct mehcached_table table_o; struct mehcached_table *table; ///// MICA END /////// -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf; @@ -54,16 +46,21 @@ struct appconf { int cpu_start; int cpu_end; int stats_cpu; - int flag; + int num_get_ops; bool sriov; uint8_t *dest_ether_addr_octet; } app_conf; -struct Args { - int socket_id; - int *next; - int next_size; +// clang-format off +static const char *mica_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-o \tFraction of MICA GET operations (default: 0.5)", + "-S \tEnable SR-IOV mode and set dest MAC address", + NULL }; +// clang-format on static int hex2int(char ch) { @@ -117,7 +114,7 @@ static void swap_mac_addresses(void *data) *dst_addr = tmp; } -static void *configure(void) +static int configure(void) { const size_t page_size = 1048576 * 2; const size_t num_numa_nodes = 1; @@ -129,14 +126,17 @@ static void *configure(void) table = &table_o; size_t numa_nodes[] = { (size_t)-1 }; - // 
mehcached_table_init(table, 1, 1, 256, false, false, false, numa_nodes[0], numa_nodes, MEHCACHED_MTH_THRESHOLD_FIFO); mehcached_table_init(table, (NUM_KEYS + MEHCACHED_ITEMS_PER_BUCKET - 1) / MEHCACHED_ITEMS_PER_BUCKET, 1, NUM_KEYS * /*MEHCACHED_ROUNDUP64*/ (alloc_overhead + 8 + 8), false, false, false, numa_nodes[0], numa_nodes, MEHCACHED_MTH_THRESHOLD_FIFO); - assert(table); - memset(default_value, 'A', 255); - default_value[255] = '\0'; + if (!table) { + log_error("Failed to initialize MICA table"); + return -1; + } + + memset(default_value, 'A', VALUE_SIZE - 1); + default_value[VALUE_SIZE - 1] = '\0'; for (size_t i = 0; i < NUM_KEYS; i++) { size_t key = i; @@ -144,14 +144,16 @@ static void *configure(void) uint64_t key_hash = hash((const uint8_t *)&key, sizeof(key)); if (!mehcached_set(0, table, key_hash, (const uint8_t *)&key, sizeof(key), (const uint8_t *)&default_value, - sizeof(default_value), 0, false)) - assert(false); + sizeof(default_value), 0, false)) { + log_error("Failed to set key %zu in MICA table", key); + return -1; + } } - return NULL; + return 0; } -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; @@ -160,12 +162,16 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->cpu_start = 0; app_conf->cpu_end = 0; app_conf->stats_cpu = 1; + app_conf->num_get_ops = 0.5 * NUM_KEYS; argc -= shift; argv += shift; while ((c = getopt(argc, argv, "c:e:s:o:S:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -176,38 +182,17 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->stats_cpu = atoi(optarg); break; case 'o': - app_conf->flag = atoi(optarg); + app_conf->num_get_ops = atof(optarg) * NUM_KEYS; break; case 'S': 
app_conf->dest_ether_addr_octet = get_mac_addr(optarg); app_conf->sriov = true; break; default: - abort(); - } -} - -static void *worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } + printf("Usage: %s -h\n", argv[-shift]); + return -1; } - } - return NULL; + return 0; } static uint16_t iph_checksum(void *vdata, size_t length) @@ -278,93 +263,161 @@ static uint16_t udph_checksum(struct udphdr *udph, struct iphdr *iph, uint8_t *p return ~sum; } +struct sock_args { + int socket_id; +}; + static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - int socket_id = a->socket_id; - // free(arg); - log_info("SOCKET_ID: %d", socket_id); - // static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; + nfds_t nfds = 1; + int ret; + struct socket *xsk; + struct xskvec *xskvecs, *sendvecs, *dropvecs; struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + uint32_t i, nrecv, nsend, wdrop, wsend, ndrop; + struct sock_args *a = (struct sock_args *)arg; - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + xsk = nf->thread[a->socket_id]->socket; - fds[0].fd = nf->thread[socket_id]->socket->fd; + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("Failed to allocate xskvecs array"); + return NULL; + } + dropvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!dropvecs) { + log_error("Failed to allocate dropvecs array"); + free(xskvecs); + return NULL; + } + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!sendvecs) { + log_error("Failed to allocate sendvecs array"); + 
free(xskvecs); + free(dropvecs); + return NULL; + } + + fds[0].fd = xsk->fd; fds[0].events = POLLIN; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; + + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); - struct xskvec *drop[nrecv]; - unsigned int tot_pkt_drop = 0; - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; + wsend = 0; + wdrop = 0; for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; + struct xskvec *xv = &xskvecs[i]; void *pkt = xv->data; void *pkt_end = pkt + xv->len; struct in_addr tmp_ip; struct ethhdr *eth = pkt; + if ((void *)(eth + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; + log_error("Dropping packet: incomplete Ethernet header"); continue; } if (eth->h_proto != htons(ETH_P_IP)) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + log_error("Dropping packet: not an IP packet"); + dropvecs[wdrop++] = xskvecs[i]; continue; } struct iphdr *iph = (void *)(eth + 1); if ((void *)(iph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + log_error("Dropping packet: incomplete IP header"); + dropvecs[wdrop++] = xskvecs[i]; + continue; + } + + size_t hdrsize = iph->ihl * 4; + /* Sanity check packet field is valid */ + if (hdrsize < sizeof(*iph)) { + log_error("Dropping packet: invalid IP header length"); + dropvecs[wdrop++] = xskvecs[i]; + continue; + } + + if (iph->protocol != IPPROTO_UDP) { + log_error("Dropping packet: not a UDP packet"); + dropvecs[wdrop++] = xskvecs[i]; + continue; + } + + /* Variable-length IPv4 header, need to use byte-based arithmetic */ + if ((void *)iph + hdrsize > pkt_end) { + log_error("Dropping packet: incomplete IP header with options"); + 
dropvecs[wdrop++] = xskvecs[i]; continue; } void *next = (void *)iph + (iph->ihl << 2); + // Assuming only UDP packets are coming struct udphdr *udph = next; + if ((void *)(udph + 1) > pkt_end) { + log_error("Dropping packet: incomplete UDP header"); + dropvecs[wdrop++] = xskvecs[i]; + continue; + } + unsigned char *payload = (unsigned char *)(udph + 1); int udp_length = ntohs(udph->len); int payload_len = udp_length - sizeof(struct udphdr); + const size_t key_size = sizeof(size_t); + + // if ((size_t)payload_len < key_size + VALUE_SIZE) { + // log_error("Dropping packet: payload too small for key+value"); + // dropvecs[wdrop++] = xskvecs[i]; + // continue; + // } + + // if ((void *)payload + key_size + VALUE_SIZE > pkt_end) { + // log_error("Dropping packet: cannot read full key+value from payload"); + // dropvecs[wdrop++] = xskvecs[i]; + // continue; + // } + size_t key; - char value[256]; + char value[VALUE_SIZE]; - // get key memcpy(&key, payload, sizeof(size_t)); - // Hardcoding so that half the packets are get, other half are store - key = default_keys[keys_index]; + + // use the key from the default set, ignoring the one in the packet keys_index = (keys_index + 1) % NUM_KEYS; + key = default_keys[keys_index]; // GET - if (app_conf.flag == 0) { + if (keys_index < app_conf.num_get_ops) { uint64_t key_hash = hash((const uint8_t *)&key, sizeof(key)); size_t value_length = sizeof(value); - if (mehcached_get(0, table, key_hash, (const uint8_t *)&key, sizeof(key), (uint8_t *)&value, - &value_length, NULL, false)) - assert(value_length == sizeof(value)); + if (!mehcached_get(0, table, key_hash, (const uint8_t *)&key, sizeof(key), (uint8_t *)&value, + &value_length, NULL, false)) { + log_error("Failed to get key %zu from MICA table", key); + dropvecs[wdrop++] = xskvecs[i]; + continue; + } - // send value - // memcpy(payload + sizeof(size_t), &value, 256); - // // re-configuring the pkt to send - // memcpy(tmp_mac, eth->h_dest, ETH_ALEN); - // memcpy(eth->h_dest, 
eth->h_source, ETH_ALEN); - // memcpy(eth->h_source, tmp_mac, ETH_ALEN); - - if (app_conf.sriov) { - swap_mac_addresses(pkt); - update_dest_mac(pkt); + if (value_length != sizeof(value)) { + log_error("Value length mismatch for key %zu: expected %zu, got %zu", key, sizeof(value), + value_length); + dropvecs[wdrop++] = xskvecs[i]; + continue; } + // send value + memcpy(payload + sizeof(size_t), &value, VALUE_SIZE); + + app_conf.sriov ? update_dest_mac(pkt) : swap_mac_addresses(pkt); + memcpy(&tmp_ip, &iph->saddr, sizeof(tmp_ip)); memcpy(&iph->saddr, &iph->daddr, sizeof(tmp_ip)); memcpy(&iph->daddr, &tmp_ip, sizeof(tmp_ip)); @@ -380,101 +433,144 @@ static void *socket_routine(void *arg) // Recalculate UDP checksum udph->check = 0; // Must set to 0 before computing checksum udph->check = udph_checksum(udph, iph, payload, payload_len); - send[tot_pkt_send++] = &msg.msg_iov[i]; + sendvecs[wsend++] = xskvecs[i]; } // STORE else { - // memcpy(value, payload + sizeof(size_t), 256); - memset(value, 'A', 255); - value[255] = '\0'; + memcpy(value, payload + key_size, VALUE_SIZE); + memset(value, 'A', VALUE_SIZE - 1); + value[VALUE_SIZE - 1] = '\0'; uint64_t key_hash = hash((const uint8_t *)&key, sizeof(key)); if (!mehcached_set(0, table, key_hash, (const uint8_t *)&key, sizeof(key), (const uint8_t *)&value, - sizeof(value), 0, true)) - assert(false); + sizeof(value), 0, true)) { + log_error("Failed to set key %zu in MICA table", key); + dropvecs[wdrop++] = xskvecs[i]; + continue; + } - // send acknowledgement - // memcpy(payload + sizeof(size_t), &value, 256); - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; } } if (nrecv) { - size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); - if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + ndrop = 
flash__dropmsg(cfg, xsk, dropvecs, wdrop); + if (ndrop != wdrop || nsend != wsend) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + break; } } if (done) break; } - free(msg.msg_iov); + free(xskvecs); + free(dropvecs); + free(sendvecs); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { log_error("ERROR: Memory allocation failed\n"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "MICA Application"; + cfg->app_options = mica_options; + cfg->done = &done; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) { + log_error("ERROR: Failed to parse command line arguments"); + goto out_cfg; + } + if (parse_app_args(argc, argv, &app_conf, shift) < 0) { + log_error("ERROR: Failed to parse application arguments"); + goto out_cfg; + } + if (flash__configure_nf(&nf, cfg) < 0) { + log_error("ERROR: Failed to configure NF"); + goto out_cfg; + } log_info("Control Plane Setup Done"); - configure(); + if (configure() < 0) { + log_error("ERROR: Failed to configure MICA"); + goto out_cfg; + } + signal(SIGINT, int_exit); signal(SIGTERM, int_exit); signal(SIGABRT, int_exit); log_info("STARTING Data Path"); + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg_close; + } + for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; + args[i].socket_id = i; - pthread_t socket_thread; - 
if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s\n", strerror(errno)); + goto out_args; + } } + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; + } + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach stats thread: %s\n", strerror(errno)); + goto out_args; } - pthread_detach(stats_thread); flash__wait(cfg); - flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; + exit(EXIT_SUCCESS); + +out_args: + done = true; + free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } \ No newline at end of file diff --git a/examples/simple-firewall/config.json b/examples/simple-firewall/config.json index dd3fe84..48b04ee 100644 --- a/examples/simple-firewall/config.json +++ b/examples/simple-firewall/config.json @@ -1,7 +1,7 @@ { "valid_src": [ - {"src_addr": "192.168.1.1", "src_port": 3000}, - {"src_addr": "192.168.1.2", 
"src_port": 3000}, - {"src_addr": "192.168.1.3", "src_port": 3000} + {"src_addr": "192.168.1.1", "src_port": 1234}, + {"src_addr": "192.168.1.2", "src_port": 1234}, + {"src_addr": "192.168.1.3", "src_port": 1234} ] } diff --git a/examples/simple-firewall/main.c b/examples/simple-firewall/main.c index ad8ace9..1a43a6a 100644 --- a/examples/simple-firewall/main.c +++ b/examples/simple-firewall/main.c @@ -23,14 +23,14 @@ #include #include -#define CONFIG_FILE "./examples/firewall/config.json" +#define CONFIG_FILE "./examples/simple-firewall/config.json" #define IP_STRLEN 16 #define PROTO_STRLEN 4 #define IFNAME_STRLEN 256 #define MAX_VALID_SESSIONS 100 -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf; @@ -46,11 +46,14 @@ struct appconf { int stats_cpu; } app_conf; -struct Args { - int socket_id; - int *next; - int next_size; +// clang-format off +static const char *firewall_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + NULL }; +// clang-format on struct session_id { uint32_t saddr; @@ -67,14 +70,14 @@ int num_sessions = 0; char load_balancer_addr[IP_STRLEN]; unsigned int load_balancer_port = 80; -static void read_json_config(void) +static int read_json_config(void) { struct in_addr addr; inet_aton(load_balancer_addr, &addr); FILE *file = fopen(CONFIG_FILE, "r"); if (!file) { - perror("Failed to open file"); - return; + log_error("Failed to open file: %s", CONFIG_FILE); + return -1; } // Get file size @@ -84,17 +87,17 @@ static void read_json_config(void) char *json_data = (char *)malloc(file_size + 1); if (!json_data) { - perror("Memory allocation failed"); + log_error("Memory allocation failed for JSON data"); fclose(file); - return; + return -1; } size_t read_size = fread(json_data, 1, file_size, file); if (read_size != file_size) { - perror("Failed to read entire file"); + log_error("Failed to read entire file: %s", CONFIG_FILE); free(json_data); 
fclose(file); - exit(1); + return -1; } json_data[file_size] = '\0'; @@ -103,8 +106,8 @@ static void read_json_config(void) cJSON *json = cJSON_Parse(json_data); free(json_data); if (!json) { - printf("Error parsing JSON\n"); - return; + log_error("Error parsing JSON"); + return -1; } cJSON *valid_src = cJSON_GetObjectItem(json, "valid_src"); @@ -113,8 +116,9 @@ static void read_json_config(void) int size = cJSON_GetArraySize(valid_src); num_sessions = size; if (num_sessions > MAX_VALID_SESSIONS) { - printf("num_sessions > MAX_VALID_SESSIONS\n"); - exit(1); + log_error("Number of sessions (%d) exceeds maximum allowed (%d)", num_sessions, MAX_VALID_SESSIONS); + cJSON_Delete(json); // Clean up + return -1; } for (int i = 0; i < num_sessions; i++) { cJSON *entry = cJSON_GetArrayItem(valid_src, i); @@ -127,38 +131,67 @@ static void read_json_config(void) valid_sessions[i].proto = IPPROTO_UDP; valid_sessions[i].daddr = addr.s_addr; valid_sessions[i].dport = htons(load_balancer_port); + log_info("Valid session added: %s:%d -> %s:%d (proto: %d)", src_addr->valuestring, src_port->valueint, + load_balancer_addr, load_balancer_port, valid_sessions[i].proto); } } } else { - printf("Error: valid_src is not a valid array\n"); + log_error("Error: valid_src is not a valid array"); + cJSON_Delete(json); // Clean up + return -1; } cJSON_Delete(json); // Clean up + return 0; } -static void *configure(void) +static int configure(void) { - int nbackends; - flash__send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); - flash__recv_data(cfg->uds_sockfd, &nbackends, sizeof(int)); + int nbackends, ret; + ret = flash__send_cmd(cfg->uds_sockfd, FLASH__GET_DST_IP_ADDR); + if (ret < 0) { + log_error("Failed to send command to UDS socket"); + return -1; + } + ret = flash__recv_data(cfg->uds_sockfd, &nbackends, sizeof(int)); + if (ret < 0) { + log_error("Failed to receive data from UDS socket"); + return -1; + } if (nbackends != 1) { - printf("Firewall is linked to %d load balancers", nbackends); 
- exit(1); + log_error("Firewall is linked to %d load balancers", nbackends); + return -1; + } + ret = flash__recv_data(cfg->uds_sockfd, load_balancer_addr, INET_ADDRSTRLEN); + if (ret < 0) { + log_error("Failed to receive data from UDS socket"); + return -1; } - flash__recv_data(cfg->uds_sockfd, load_balancer_addr, INET_ADDRSTRLEN); - read_json_config(); + ret = read_json_config(); + if (ret < 0) { + log_error("Failed to read JSON config"); + return -1; + } - hashmap_init(&valid_sessions_map, sizeof(struct session_id), sizeof(int), MAX_VALID_SESSIONS); + ret = hashmap_init(&valid_sessions_map, sizeof(struct session_id), sizeof(int), MAX_VALID_SESSIONS); + if (ret != 1) { + log_error("ERROR: unable to initialize valid sessions hashmap"); + return -1; + } for (int session_num = 0; session_num < num_sessions; session_num++) { struct session_id *key = &valid_sessions[session_num]; int val = 1; - hashmap_insert_elem(&valid_sessions_map, (void *)key, (void *)&val); + ret = hashmap_insert_elem(&valid_sessions_map, (void *)key, (void *)&val); + if (ret != 1) { + log_error("ERROR: unable to add valid session to hashmap"); + return -1; + } } - return NULL; + return 0; } -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; @@ -171,8 +204,11 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int argc -= shift; argv += shift; - while ((c = getopt(argc, argv, "c:e:s:")) != -1) + while ((c = getopt(argc, argv, "hc:e:s:")) != -1) switch (c) { + case 'h': + printf("Usage: %s -h\n", argv[-shift]); + return -1; case 'c': app_conf->cpu_start = atoi(optarg); break; @@ -183,88 +219,85 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->stats_cpu = atoi(optarg); break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + return 0; } -static void 
*worker__stats(void *arg) -{ - (void)arg; - - if (cfg->verbose) { - unsigned int interval = cfg->stats_interval; - setlocale(LC_ALL, ""); - - for (int i = 0; i < cfg->total_sockets; i++) - nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); - - while (!done) { - sleep(interval); - if (system("clear") != 0) - log_error("Terminal clear error"); - for (int i = 0; i < cfg->total_sockets; i++) { - flash__dump_stats(cfg, nf->thread[i]->socket); - } - } - } - return NULL; -} +struct sock_args { + int socket_id; +}; static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; + int ret; + nfds_t nfds = 1; + struct socket *xsk; + struct xskvec *xskvecs, *sendvecs, *dropvecs; + struct pollfd fds[1] = {}; + uint32_t i, nrecv, nsend, ndrop, wdrop, wsend; + struct sock_args *a = (struct sock_args *)arg; + int socket_id = a->socket_id; - int *next = a->next; - int next_size = a->next_size; - // free(arg); + + xsk = nf->thread[socket_id]->socket; log_info("SOCKET_ID: %d", socket_id); - // static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; - struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; - log_info("2_NEXT_SIZE: %d", next_size); + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("ERROR: Memory allocation failed for xskvecs"); + return NULL; + } - for (int i = 0; i < next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, next[i]); + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!sendvecs) { + log_error("ERROR: Memory allocation failed for sendvecs"); + free(xskvecs); + return NULL; } - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + dropvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!dropvecs) { + log_error("ERROR: Memory allocation failed for dropvecs"); + free(xskvecs); + free(sendvecs); + return NULL; + } - fds[0].fd = nf->thread[socket_id]->socket->fd; + fds[0].fd = xsk->fd; fds[0].events = POLLIN; for (;;) { - if (cfg->xsk->mode & 
FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if (ret <= 0 || ret > 1) - continue; - } - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); - - struct xskvec *drop[nrecv]; - unsigned int tot_pkt_drop = 0; - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); + wsend = 0; + wdrop = 0; for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; + struct xskvec *xv = &xskvecs[i]; void *pkt = xv->data; void *pkt_end = pkt + xv->len; + struct ethhdr *eth = pkt; if ((void *)(eth + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; + log_error("Packet too short for Ethernet header"); continue; } if (eth->h_proto != htons(ETH_P_IP)) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; + log_error("Unsupported Ethernet protocol"); continue; } struct iphdr *iph = (void *)(eth + 1); if ((void *)(iph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; + log_error("Packet too short for IP header"); continue; } @@ -276,7 +309,8 @@ static void *socket_routine(void *arg) case IPPROTO_TCP:; struct tcphdr *tcph = next; if ((void *)(tcph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; + log_error("Packet too short for TCP header"); continue; } @@ -288,7 +322,8 @@ static void *socket_routine(void *arg) case IPPROTO_UDP:; struct udphdr *udph = next; if ((void *)(udph + 1) > pkt_end) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; + log_error("Packet too short for UDP header"); continue; } @@ -298,7 +333,8 @@ static void *socket_routine(void *arg) break; default: - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; + 
log_error("Unsupported IP protocol: %d", iph->protocol); continue; } @@ -310,94 +346,126 @@ static void *socket_routine(void *arg) sid.dport = *dport; if (hashmap_lookup_elem(&valid_sessions_map, (void *)&sid) == NULL) { - drop[tot_pkt_drop++] = &msg.msg_iov[i]; + dropvecs[wdrop++] = xskvecs[i]; continue; } - send[tot_pkt_send++] = &msg.msg_iov[i]; + sendvecs[wsend++] = xskvecs[i]; } if (nrecv) { - size_t ret_send = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - size_t ret_drop = flash__olddropmsg(cfg, nf->thread[socket_id]->socket, drop, tot_pkt_drop); - if (ret_send != tot_pkt_send || ret_drop != tot_pkt_drop) { + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + ndrop = flash__dropmsg(cfg, xsk, dropvecs, wdrop); + if (nsend != wsend || ndrop != wdrop) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + break; } } if (done) break; } - free(msg.msg_iov); + free(xskvecs); + free(sendvecs); + free(dropvecs); + hashmap_free(&valid_sessions_map); return NULL; } int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { log_error("ERROR: Memory allocation failed\n"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "simple-firewall"; + cfg->app_options = firewall_options; + cfg->done = &done; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; log_info("Control Plane Setup Done"); - configure(); + if (configure() < 0) { + 
log_error("Error configuring the application"); + goto out_cfg; + } + signal(SIGINT, int_exit); signal(SIGTERM, int_exit); signal(SIGABRT, int_exit); log_info("STARTING Data Path"); + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg_close; + } for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; + args[i].socket_id = i; - log_info("2_NEXT_SIZE: %d", args->next_size); - - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } - - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, flash__stats_thread, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; + } + if 
(pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; } - pthread_detach(stats_thread); flash__wait(cfg); - flash__xsk_close(cfg, nf); - return EXIT_SUCCESS; -} \ No newline at end of file + exit(EXIT_SUCCESS); +out_args: + done = true; + free(args); +out_cfg_close: + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); +} diff --git a/examples/unit-tests/backpressure.c b/examples/unit-tests/backpressure.c index 8e56341..d5f91f7 100644 --- a/examples/unit-tests/backpressure.c +++ b/examples/unit-tests/backpressure.c @@ -20,7 +20,7 @@ #define TEST_PORT 8080 -bool done = false; +volatile bool done = false; struct config *cfg = NULL; struct nf *nf; struct test_stats *stats_arr; @@ -32,21 +32,27 @@ static void int_exit(int sig) } struct testHeader { - __u8 lastHop; - __u8 hopCount; - __u64 pktId; - __u16 old_dst; + uint8_t lastHop; + uint8_t hopCount; + uint64_t pktId; + uint16_t old_dst; + int sender_nf_id; + int sender_next_size; }; +#define MAX_NFS 16 +struct nf_info { + int sender_next_size; + bool first_packet_received; + uint64_t expected_mod_value; + uint64_t next_expected_pkt_id; +} nf_info_arr[MAX_NFS] = { 0 }; + struct test_stats { - __u64 pkt_count; - __u64 even_next; // Next expected even packet ID - __u64 odd_next; // Next expected odd packet ID - __u64 pkt_dropped; - __u64 pkt_corrupted; - __u64 pkt_correct; - __u64 even; - __u64 odd; + uint64_t pkt_count; + uint64_t pkt_dropped; + uint64_t pkt_corrupted; + uint64_t pkt_correct; }; struct appconf { @@ -107,7 +113,21 @@ static void burn_cycles(__u64 cycles_to_burn) } } -static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +// clang-format off +static const char *backpressure_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-h \tNumber of hops (default: 1)", + "-B \tBurn cycles (default: 10000000)", + 
"-v\t\tEnable variable-length packets (default: disabled)", + "-a \tVariable start value (default: 0)", + "-z \tVariable end value (default: 0)", + NULL +}; +// clang-format on + +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { int c; opterr = 0; @@ -152,8 +172,10 @@ static void parse_app_args(int argc, char **argv, struct appconf *app_conf, int app_conf->variable_end = atoi(optarg); break; default: - abort(); + printf("Usage: %s -h\n", argv[-shift]); + return -1; } + return 0; } static void process_packets(void *data, __u32 *len, struct test_stats *stats) @@ -163,11 +185,13 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) struct ethhdr *eth = (struct ethhdr *)pos; if ((void *)(eth + 1) > data_end) { + log_error("Ethernet header is not valid"); stats->pkt_dropped++; return; } if (eth->h_proto != htons(ETH_P_IP)) { + log_error("Ethernet protocol is not IP"); stats->pkt_dropped++; return; } @@ -178,6 +202,7 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) size_t hdrsize; if ((void *)iph + 1 > data_end) { + log_error("IP header is not valid"); stats->pkt_dropped++; return; } @@ -185,17 +210,20 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) hdrsize = iph->ihl * 4; /* Sanity check packet field is valid */ if (hdrsize < sizeof(*iph)) { + log_error("IP header size is invalid"); stats->pkt_dropped++; return; } if (iph->protocol != IPPROTO_UDP) { + log_error("IP protocol is not UDP"); stats->pkt_dropped++; return; } /* Variable-length IPv4 header, need to use byte-based arithmetic */ if (pos + hdrsize > data_end) { + log_error("IP header is not valid"); stats->pkt_dropped++; return; } @@ -206,6 +234,7 @@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) struct udphdr *udphdr = pos; if ((void *)udphdr + 1 > data_end) { + log_error("UDP header is not valid"); stats->pkt_dropped++; return; } @@ -214,155 +243,188 
@@ static void process_packets(void *data, __u32 *len, struct test_stats *stats) payload_len = ntohs(udphdr->len) - sizeof(struct udphdr); size_t testHeaderLen = sizeof(struct testHeader); + void *payload_end = pos + payload_len; + + struct testHeader *testHeader = NULL; /* First NF */ if (ntohs(udphdr->dest) != TEST_PORT) { - // Shift the data to add the test header. Can we do this without memmove?? - memmove(pos + testHeaderLen, pos, payload_len); - - // Add test header and update the old length - struct testHeader *testHeader = pos; + // Append test header at the end of the UDP payload + testHeader = (struct testHeader *)payload_end; testHeader->lastHop = app_conf.hops; testHeader->hopCount = 1; - testHeader->pktId = stats->pkt_count++; + testHeader->old_dst = udphdr->dest; + *len += testHeaderLen; + udphdr->len = htons(ntohs(udphdr->len) + testHeaderLen); + iph->tot_len = htons(ntohs(iph->tot_len) + testHeaderLen); - // update the udp header - testHeader->old_dst = udphdr->dest; udphdr->dest = htons(TEST_PORT); - udphdr->len = htons(ntohs(udphdr->len) + testHeaderLen); - // update the ip payload length - iph->tot_len = htons(ntohs(iph->tot_len) + testHeaderLen); + stats->pkt_correct++; } else { - struct testHeader *testHeader = pos; + // check if the test header is present + if (payload_len < testHeaderLen) { + stats->pkt_dropped++; + log_error("ERROR: Test header not found in packet"); + return; + } + + // testHeader is at the end of the UDP payload + testHeader = (struct testHeader *)(payload_end - testHeaderLen); testHeader->hopCount++; - if (testHeader->pktId % 2 == 0) { // Even packet - if (testHeader->pktId != stats->even_next) { - if (testHeader->pktId < stats->even_next) { - stats->pkt_corrupted++; - stats->even_next = testHeader->pktId + 2; - } else { - stats->pkt_corrupted++; - stats->even_next = testHeader->pktId + 2; - } - } else { - stats->even++; - stats->pkt_correct++; - stats->even_next += 2; + uint64_t received_pktId = testHeader->pktId; + int 
sender_nf_id = testHeader->sender_nf_id; + int sender_next_size = testHeader->sender_next_size; + + if (sender_nf_id < 0 || sender_nf_id >= MAX_NFS) { + log_error("ERROR: Invalid sender NF ID %d", sender_nf_id); + stats->pkt_corrupted++; + goto test_header_update; + } + if (sender_next_size <= 0) { + log_error("ERROR: Invalid sender next size %d", sender_next_size); + stats->pkt_corrupted++; + goto test_header_update; + } + + struct nf_info *sender_info = &nf_info_arr[sender_nf_id]; + + if (!sender_info->first_packet_received) { + sender_info->first_packet_received = true; + sender_info->sender_next_size = sender_next_size; + sender_info->expected_mod_value = received_pktId % sender_next_size; + sender_info->next_expected_pkt_id = received_pktId + sender_next_size; + stats->pkt_correct++; // first packet is always correct + } else { + if (sender_next_size != sender_info->sender_next_size) { + log_error("ERROR: nf_next_size mismatch for NF ID %d: expected %d, got %d", sender_nf_id, + sender_info->sender_next_size, sender_next_size); + stats->pkt_corrupted++; + goto test_header_update; + } + + if (received_pktId % sender_next_size != sender_info->expected_mod_value) { + log_error("ERROR: pktId %% sender_next_size mismatch for NF ID %d: expected %lu, got %lu", + sender_nf_id, sender_info->expected_mod_value, received_pktId % sender_next_size); + stats->pkt_corrupted++; + goto test_header_update; } - } else { // Odd packet - if (testHeader->pktId != stats->odd_next) { - if (testHeader->pktId < stats->odd_next) { + + uint64_t next_expected_pkt_id = sender_info->next_expected_pkt_id; + + if (received_pktId != next_expected_pkt_id) { + if (received_pktId < next_expected_pkt_id) { + log_error("ERROR: Received pktId %lu is less than expected %lu for NF ID %d", received_pktId, + next_expected_pkt_id, sender_nf_id); stats->pkt_corrupted++; - stats->odd_next = testHeader->pktId + 2; } else { - stats->pkt_corrupted++; - stats->odd_next = testHeader->pktId + 2; + 
sender_info->next_expected_pkt_id = received_pktId + sender_next_size; + log_info("Received pktId %lu, updating next_expected_pkt_id to %lu for NF ID %d", + received_pktId, sender_info->next_expected_pkt_id, sender_nf_id); + stats->pkt_dropped += (received_pktId - next_expected_pkt_id) / sender_next_size; } } else { - stats->odd++; + sender_info->next_expected_pkt_id += sender_next_size; stats->pkt_correct++; - stats->odd_next += 2; } } + } - if (testHeader->lastHop == testHeader->hopCount) { - uint8_t tmp_mac[ETH_ALEN]; - struct in_addr tmp_ip; - unsigned short tmp_port; - payload_len -= testHeaderLen; +test_header_update: + testHeader->pktId = stats->pkt_count++; + testHeader->sender_nf_id = cfg->nf_id; + testHeader->sender_next_size = nf->next_size; - tmp_port = testHeader->old_dst; + if (testHeader->lastHop == testHeader->hopCount) { + uint8_t tmp_mac[ETH_ALEN]; + struct in_addr tmp_ip; + unsigned short tmp_port; + payload_len -= testHeaderLen; - // Shift the data to remove the test header - memmove(pos, pos + testHeaderLen, payload_len); + tmp_port = testHeader->old_dst; - // update the udp header - udphdr->dest = tmp_port; - udphdr->len = htons(ntohs(udphdr->len) - testHeaderLen); - *len -= testHeaderLen; + udphdr->dest = tmp_port; + udphdr->len = htons(ntohs(udphdr->len) - testHeaderLen); + *len -= testHeaderLen; - tmp_port = udphdr->dest; - udphdr->dest = udphdr->source; - udphdr->source = tmp_port; + tmp_port = udphdr->dest; + udphdr->dest = udphdr->source; + udphdr->source = tmp_port; - // update the ip payload length - iph->tot_len = htons(ntohs(iph->tot_len) - testHeaderLen); + iph->tot_len = htons(ntohs(iph->tot_len) - testHeaderLen); - memcpy(tmp_mac, eth->h_dest, ETH_ALEN); - memcpy(eth->h_dest, eth->h_source, ETH_ALEN); - memcpy(eth->h_source, tmp_mac, ETH_ALEN); + memcpy(tmp_mac, eth->h_dest, ETH_ALEN); + memcpy(eth->h_dest, eth->h_source, ETH_ALEN); + memcpy(eth->h_source, tmp_mac, ETH_ALEN); - memcpy(&tmp_ip, &iph->saddr, sizeof(tmp_ip)); - 
memcpy(&iph->saddr, &iph->daddr, sizeof(tmp_ip)); - memcpy(&iph->daddr, &tmp_ip, sizeof(tmp_ip)); - } + memcpy(&tmp_ip, &iph->saddr, sizeof(tmp_ip)); + memcpy(&iph->saddr, &iph->daddr, sizeof(tmp_ip)); + memcpy(&iph->daddr, &tmp_ip, sizeof(tmp_ip)); } return; } -struct Args { +struct sock_args { int socket_id; - int *next; int next_size; }; static void *socket_routine(void *arg) { - struct Args *a = (struct Args *)arg; - int socket_id = a->socket_id; - int *next = a->next; - int next_size = a->next_size; - log_info("SOCKET_ID: %d", socket_id); - static __u32 nb_frags; - int i, ret, nfds = 1, nrecv; + nfds_t nfds = 1; + int ret, next_size; + struct socket *xsk; + struct xskvec *xskvecs, *sendvecs; struct pollfd fds[1] = {}; - struct xskmsghdr msg = {}; + uint32_t i, nrecv, nsend, count, nb_frags = 0, wsend; + struct sock_args *a = (struct sock_args *)arg; - log_info("2_NEXT_SIZE: %d", next_size); + next_size = a->next_size; - for (int i = 0; i < next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, next[i]); - } + log_debug("SOCKET_ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; cfg->xsk->poll_timeout = -1; - msg.msg_iov = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("ERROR: Memory allocation failed for xskvecs"); + return NULL; + } - fds[0].fd = nf->thread[socket_id]->socket->fd; - fds[0].events = POLLIN; + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!sendvecs) { + log_error("ERROR: Memory allocation failed for sendvecs"); + free(xskvecs); + return NULL; + } - nf->thread[socket_id]->socket->idle_fd.fd = nf->thread[socket_id]->socket->fd; - nf->thread[socket_id]->socket->idle_fd.events = POLLIN; + fds[0].fd = xsk->fd; + fds[0].events = POLLIN; - unsigned int count = 0; + count = 0; for (;;) { - if (cfg->xsk->mode & FLASH__POLL) { - ret = flash__oldpoll(nf->thread[socket_id]->socket, fds, nfds, cfg->xsk->poll_timeout); - if 
(ret != 1) - continue; - } + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; - nrecv = flash__oldrecvmsg(cfg, nf->thread[socket_id]->socket, &msg); + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); - struct xskvec *send[nrecv]; - unsigned int tot_pkt_send = 0; - for (i = 0; i < nrecv; i++) { - struct xskvec *xv = &msg.msg_iov[i]; - bool eop = IS_EOP_DESC(xv->options); + wsend = 0; + for (i = 0; i < nrecv; i++) { if (next_size != 0) { - xv->options = ((count % next_size) << 16) | (xv->options & 0xFFFF); + xskvecs[i].options = ((count % next_size) << 16) | (xskvecs[i].options & 0xFFFF); count++; } - char *pkt = xv->data; + char *pkt = xskvecs[i].data; if (!nb_frags++) - process_packets(pkt, &xv->len, &stats_arr[socket_id]); + process_packets(pkt, &xskvecs[i].len, &stats_arr[a->socket_id]); if (app_conf.variable) { __u64 random_cycles = @@ -373,29 +435,33 @@ static void *socket_routine(void *arg) burn_cycles(app_conf.burn_cycles); } - send[tot_pkt_send++] = &msg.msg_iov[i]; - if (eop) + sendvecs[wsend++] = xskvecs[i]; + + if (IS_EOP_DESC(xskvecs[i].options)) nb_frags = 0; } if (nrecv) { - ret = flash__oldsendmsg(cfg, nf->thread[socket_id]->socket, send, tot_pkt_send); - if (ret != nrecv) { + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + if (nsend != nrecv) { log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); + break; } } if (done) break; } - free(msg.msg_iov); + free(xskvecs); + free(sendvecs); return NULL; } -static void *worker__stats(void *arg) +static void *worker__stats(void *conf) { - (void)arg; + struct stats_conf *arg = (struct stats_conf *)conf; + struct nf *nf = arg->nf; + struct config *cfg = arg->cfg; if (cfg->verbose) { unsigned int interval = cfg->stats_interval; @@ -410,11 +476,9 @@ static void *worker__stats(void *arg) log_error("Terminal clear error"); for (int i = 0; i < cfg->total_sockets; i++) { flash__dump_stats(cfg, nf->thread[i]->socket); - printf("%-18s 
%'-14llu\n", "dropped", stats_arr[i].pkt_dropped); - printf("%-18s %'-14llu\n", "corrupt", stats_arr[i].pkt_corrupted); - printf("%-18s %'-14llu\n", "correct", stats_arr[i].pkt_correct); - printf("%-18s %'-14llu\n", "even", stats_arr[i].even); - printf("%-18s %'-14llu\n", "odd", stats_arr[i].odd); + printf("%-18s %'-14lu\n", "dropped", stats_arr[i].pkt_dropped); + printf("%-18s %'-14lu\n", "corrupt", stats_arr[i].pkt_corrupted); + printf("%-18s %'-14lu\n", "correct", stats_arr[i].pkt_correct); } } } @@ -423,21 +487,37 @@ static void *worker__stats(void *arg) int main(int argc, char **argv) { + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + cfg = calloc(1, sizeof(struct config)); if (!cfg) { log_error("ERROR: Memory allocation failed\n"); exit(EXIT_FAILURE); } - int n = flash__parse_cmdline_args(argc, argv, cfg); - parse_app_args(argc, argv, &app_conf, n); - flash__configure_nf(&nf, cfg); - flash__populate_fill_ring(nf->thread, cfg->umem->frame_size, cfg->total_sockets, cfg->umem_offset, cfg->umem_scale); + cfg->app_name = "Backpressure Application"; + cfg->app_options = backpressure_options; + cfg->done = &done; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; stats_arr = calloc(cfg->total_sockets, sizeof(struct test_stats)); - stats_arr->even_next = 0; - stats_arr->odd_next = 1; + if (!stats_arr) { + log_error("ERROR: Memory allocation failed for stats_arr"); + goto out_cfg; + } log_info("Control Plane Setup Done"); @@ -447,49 +527,67 @@ int main(int argc, char **argv) log_info("STARTING Data Path"); - for (int i = 0; i < cfg->total_sockets; i++) { - struct Args *args = calloc(1, sizeof(struct Args)); - args->socket_id = i; - args->next = nf->next; - args->next_size = nf->next_size; + args = 
calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg_close; + } - log_info("2_NEXT_SIZE: %d", args->next_size); + for (int i = 0; i < cfg->total_sockets; i++) { + args[i].socket_id = i; + args[i].next_size = nf->next_size; - for (int i = 0; i < args->next_size; i++) { - log_info("2_NEXT_ITEM_%d %d", i, nf->next[i]); - } + log_info("2_NEXT_SIZE: %d", args[i].next_size); - pthread_t socket_thread; - if (pthread_create(&socket_thread, NULL, socket_routine, args)) { + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { log_error("Error creating socket thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; } - pthread_detach(socket_thread); + if (pthread_detach(socket_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } } - pthread_t stats_thread; - if (pthread_create(&stats_thread, NULL, worker__stats, NULL)) { + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, worker__stats, &stats_cfg)) { log_error("Error creating statistics thread"); - exit(EXIT_FAILURE); + goto out_args; } CPU_ZERO(&cpuset); CPU_SET(app_conf.stats_cpu, &cpuset); if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); - exit(EXIT_FAILURE); + goto out_args; + } + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; } - pthread_detach(stats_thread); flash__wait(cfg); flash__xsk_close(cfg, nf); return EXIT_SUCCESS; + +out_args: + done = true; + 
free(args); +out_cfg_close: + free(stats_arr); + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); } diff --git a/examples/unit-tests/correctness.c b/examples/unit-tests/correctness.c index e5eebe0..1f212d1 100644 --- a/examples/unit-tests/correctness.c +++ b/examples/unit-tests/correctness.c @@ -63,8 +63,15 @@ struct appconf { int hops; } app_conf; -static const char *correctness_options[] = { "-c \tStart CPU (default: 0)", "-e \tEnd CPU (default: 0)", - "-s \tStats CPU (default: 1)", "-h \tNumber of hops (default: 1)", NULL }; +// clang-format off +static const char *correctness_options[] = { + "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-h \tNumber of hops (default: 1)", + NULL +}; +// clang-format on static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) { From 522f147d95fcad57660d2507d22216acd53a0376 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Mon, 15 Sep 2025 13:44:24 +0530 Subject: [PATCH 24/43] feat: Added fast_log API - Useful for debugging when large amounts of data need to be written to a file. - Supports creating multiple target files (e.g. one per NF) for debugging output. 
--- lib/flash/log/log.c | 43 +++++++++++++++++++++++++++++++++++++++++++ lib/flash/log/log.h | 25 +++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/lib/flash/log/log.c b/lib/flash/log/log.c index 10df2fc..b7f2b67 100644 --- a/lib/flash/log/log.c +++ b/lib/flash/log/log.c @@ -215,3 +215,46 @@ void log_set_level_from_env(void) log_set_level(LOG_INFO); } } + +#ifdef FAST_LOG_TO_FILE + +char fast_log_buffer[FAST_LOG_BATCH_SIZE][FAST_LOG_SIZE]; +int fast_log_index = 0; + +static void fast_log_dump_in_file(int nf_id, int count) +{ + char filename[FAST_LOG_SIZE]; + snprintf(filename, sizeof(filename), FAST_LOG_DIR "nf-%d.log", nf_id); + FILE *fp = fopen(filename, "a"); + if (fp) { + for (int i = 0; i < count; i++) { + fprintf(fp, "%s\n", fast_log_buffer[i]); + } + fclose(fp); + } else { + log_error("Error opening string log file"); + } +} + +void fast_log(int nf_id, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + vsnprintf(fast_log_buffer[fast_log_index++], FAST_LOG_SIZE, fmt, args); + va_end(args); + + if (fast_log_index >= FAST_LOG_BATCH_SIZE) { + fast_log_dump_in_file(nf_id, FAST_LOG_BATCH_SIZE); + fast_log_index = 0; + } +} + +void fast_log_flush(int nf_id) +{ + if (fast_log_index > 0) { + fast_log_dump_in_file(nf_id, fast_log_index); + fast_log_index = 0; + } +} + +#endif diff --git a/lib/flash/log/log.h b/lib/flash/log/log.h index f5efdc2..73b543b 100644 --- a/lib/flash/log/log.h +++ b/lib/flash/log/log.h @@ -40,4 +40,29 @@ void log_set_level_from_env(void); void log_log(int level, const char *file, int line, const char *caller, const char *fmt, ...); +#define FAST_LOG_TO_FILE + +#ifdef FAST_LOG_TO_FILE +#define FAST_LOG_BATCH_SIZE 1024 +#define FAST_LOG_SIZE 256 +#define FAST_LOG_DIR "flash_nf_logs/" + +/* Log a string to a file named after the nf_id + The log is buffered and written in batches for efficiency + Call fast_log_flush(nf_id) to flush the buffer to the file + NOTE: Manually clear the file contents before starting 
logging for a new run +*/ +void fast_log(int nf_id, const char *fmt, ...); +void fast_log_flush(int nf_id); +#else + +#define fast_log(...) \ + do { \ + } while (0) +#define fast_log_flush(...) \ + do { \ + } while (0) + +#endif + #endif /* LOG_H */ From fcd2949b0382488344879c4ac45016ca05cbcc1c Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Mon, 15 Sep 2025 13:47:34 +0530 Subject: [PATCH 25/43] feat: new smart poll - Implement backpressure with poll wait instead of sleep. - Updated the __reserve_tx algorithm. - When unable to reserve space in the TX ring, the NF waits until downstream NFs wake it up (enough FQ buffers and free RX ring space available) - Replace periodic sleep flag `p` with flag `s` - Use flag `p` for poll with POLLOUT --- lib/flash/monitor/flash_cfgparser.c | 18 +++++++ lib/flash/monitor/flash_monitor.c | 29 ++++++++-- lib/flash/nf/flash_nf.c | 52 ++++++++++++++++++ lib/flash/nf/flash_stats.c | 2 + lib/flash/nf/flash_txrx.c | 84 ++++++++++++++++++++++++++--- lib/flash/params/flash_params.c | 12 +++-- lib/flash/uds/flash_uds.h | 2 + lib/include/flash_defines.h | 12 +++++ monitor/main.c | 16 ++++++ 9 files changed, 212 insertions(+), 15 deletions(-) diff --git a/lib/flash/monitor/flash_cfgparser.c b/lib/flash/monitor/flash_cfgparser.c index 445b442..47feb8c 100644 --- a/lib/flash/monitor/flash_cfgparser.c +++ b/lib/flash/monitor/flash_cfgparser.c @@ -119,6 +119,9 @@ struct NFGroup *parse_json(const char *filename) nf_group->umem_count = cJSON_GetArraySize(umem_array); nf_group->umem = (struct umem **)calloc(nf_group->umem_count, sizeof(struct umem *)); + // create a 2D prev array to store previous xsk ids for each umem + int *prev[FLASH_MAX_XSK] = { 0 }; + int prev_size[FLASH_MAX_XSK] = { 0 }; // Iterate over each "umem" entry for (int i = 0; i < nf_group->umem_count; i++) { cJSON *umem_obj = cJSON_GetArrayItem(umem_array, i); @@ -287,6 +290,14 @@ struct NFGroup *parse_json(const char *filename) } next[l] = cJSON_GetNumberValue(item); log_info("%d 
-> %d", nf_group->umem[i]->nf[j]->id, next[l]); + + int u = nf_group->umem[i]->nf[j]->id; + int v = next[l]; + if (prev[v] == NULL) { + prev[v] = (int *)calloc(FLASH_MAX_XSK, sizeof(int)); + prev_size[v] = 0; + } + prev[v][prev_size[v]++] = u; } cJSON *thread_array = cJSON_GetObjectItem(nf_obj, "thread"); @@ -332,6 +343,13 @@ struct NFGroup *parse_json(const char *filename) num_queues += total_threads; } + for (int i = 0; i < nf_group->umem_count; i++) { + for (int j = 0; j < nf_group->umem[i]->nf_count; j++) { + nf_group->umem[i]->nf[j]->prev = prev[nf_group->umem[i]->nf[j]->id]; + nf_group->umem[i]->nf[j]->prev_size = prev_size[nf_group->umem[i]->nf[j]->id]; + } + } + configure_nic(ifname, num_queues, mode); cJSON_Delete(root); diff --git a/lib/flash/monitor/flash_monitor.c b/lib/flash/monitor/flash_monitor.c index 8d0d205..27fd454 100644 --- a/lib/flash/monitor/flash_monitor.c +++ b/lib/flash/monitor/flash_monitor.c @@ -58,6 +58,13 @@ void close_nf(struct umem *umem, int umem_id, int nf_id) umem->cfg->umem->size = 0; } umem->cfg->umem_fd = -1; + close(umem->cfg->nf_pollout_status_fd); + if (umem->cfg->nf_pollout_status) { + munmap((void *)(uintptr_t)umem->cfg->nf_pollout_status, umem->cfg->nf_pollout_status_size); + umem->cfg->nf_pollout_status = NULL; + umem->cfg->nf_pollout_status_size = 0; + } + umem->cfg->nf_pollout_status_fd = -1; free(umem->cfg->umem_config); free(umem->cfg->xsk_config); free(umem->umem_info); @@ -141,11 +148,11 @@ const char *process_input(char *input) return NULL; } -static int create_umem_fd(size_t size) +static int create_memfd(const char *name, size_t size) { int fd, ret; - fd = memfd_create("UMEM0", MFD_ALLOW_SEALING); + fd = memfd_create(name, MFD_ALLOW_SEALING); if (fd == -1) exit(1); @@ -221,7 +228,7 @@ static void flash__setup_umem(struct umem *umem) log_info("UMEM size: %lu", size); - fd = create_umem_fd(size); + fd = create_memfd("UMEM0", size); flags = MAP_SHARED; /* Reserve memory for the umem. 
Use hugepages if unaligned chunk mode is enabled */ @@ -232,10 +239,21 @@ static void flash__setup_umem(struct umem *umem) } umem->cfg->umem->buffer = packet_buffer; umem->cfg->umem->size = size; - - __configure_umem(umem); umem->cfg->umem_fd = fd; + size = FLASH_MAX_XSK * sizeof(uint8_t); + fd = create_memfd("POLLOUT_STATUS_MEM0", size); + flags = MAP_SHARED; + + umem->cfg->nf_pollout_status = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, fd, 0); + if (umem->cfg->nf_pollout_status == MAP_FAILED) { + log_error("ERROR: (POLLOUT_STATUS setup) mmap failed \"%s\"\n", strerror(errno)); + exit(EXIT_FAILURE); + } + umem->cfg->nf_pollout_status_size = size; + umem->cfg->nf_pollout_status_fd = fd; + + __configure_umem(umem); return; } @@ -326,6 +344,7 @@ static void init_config(struct config *cfg) cfg->umem->frame_size = FRAME_SIZE; cfg->xsk->batch_size = BATCH_SIZE; cfg->umem_fd = -1; + cfg->nf_pollout_status_fd = -1; } int configure_umem(struct nf_data *data, struct umem **_umem) diff --git a/lib/flash/nf/flash_nf.c b/lib/flash/nf/flash_nf.c index e8d4027..125909a 100644 --- a/lib/flash/nf/flash_nf.c +++ b/lib/flash/nf/flash_nf.c @@ -160,6 +160,7 @@ static int __configure(struct config *cfg, struct nf *nf, int **received_fd) goto clean_rcv_fd; } log_debug("ROUTE SIZE: %d", nf->next_size); + cfg->next_size = nf->next_size; if (flash__send_cmd(uds_sockfd, FLASH__GET_BIND_FLAGS) < 0) { log_error("Failed to send command to get bind flags"); @@ -224,6 +225,41 @@ static int __configure(struct config *cfg, struct nf *nf, int **received_fd) } log_debug("IFNAME: %s", cfg->ifname); + if (flash__send_cmd(uds_sockfd, FLASH__GET_POLLOUT_STATUS) < 0) { + log_error("Failed to send command to get pollout status fd, size, shared memory"); + goto clean_rcv_fd; + } + if (flash__recv_fd(uds_sockfd, &cfg->nf_pollout_status_fd) < 0) { + log_error("Failed to receive pollout status fd from UDS server"); + goto clean_rcv_fd; + } + log_debug("RECEIVED POLLOUT STATUS FD: %d", 
cfg->nf_pollout_status_fd); + if (flash__recv_data(uds_sockfd, &cfg->nf_pollout_status_size, sizeof(int)) < 0) { + log_error("Failed to receive pollout_status array size from UDS server"); + goto clean_rcv_fd; + } + log_debug("RECEIVED POLLOUT_STATUS SIZE: %d", cfg->nf_pollout_status_size); + + if (flash__send_cmd(uds_sockfd, FLASH__GET_PREV_NF) < 0) { + log_error("Failed to send command to get previous NFs"); + goto clean_rcv_fd; + } + if (flash__recv_data(uds_sockfd, &cfg->prev_size, sizeof(int)) < 0) { + log_error("Failed to receive number of previous NFs from UDS server"); + goto clean_rcv_fd; + } + log_debug("NUMBER OF PREVIOUS NFs: %d", cfg->prev_size); + cfg->prev = (int *)calloc(cfg->prev_size, sizeof(int)); + for (i = 0; i < cfg->prev_size; i++) { + int prev_nf_id; + if (flash__recv_data(uds_sockfd, &prev_nf_id, sizeof(int)) < 0) { + log_error("Failed to receive previous NF id from UDS server"); + free(cfg->prev); + goto clean_rcv_fd; + } + cfg->prev[i] = prev_nf_id; + log_debug("RECEIVED PREVIOUS NF ID: %d", prev_nf_id); + } return 0; clean_rcv_fd: @@ -419,6 +455,11 @@ void flash__xsk_close(struct config *cfg, struct nf *nf) off.cr.desc + cfg->umem_config->comp_size * sizeof(uint64_t)); } + close(cfg->nf_pollout_status_fd); + munmap((void *)(uintptr_t)cfg->nf_pollout_status, cfg->nf_pollout_status_size); + cfg->nf_pollout_status = NULL; + cfg->nf_pollout_status_size = 0; + free(nf->thread[i]->socket); free(nf->thread[i]); } @@ -482,6 +523,15 @@ int flash__configure_nf(struct nf **_nf, struct config *cfg) goto out_error; } + void *shm_ptr = mmap(NULL, cfg->nf_pollout_status_size, PROT_READ | PROT_WRITE, MAP_SHARED, cfg->nf_pollout_status_fd, 0); + if (shm_ptr == MAP_FAILED) { + log_error("ERROR: mmap failed: %s", strerror(errno)); + close(cfg->nf_pollout_status_fd); + goto out_error; + } + cfg->nf_pollout_status = (uint8_t *)shm_ptr; + cfg->nf_pollout_status[cfg->nf_id] = 0; + if (!size && !xsk_page_aligned(cfg->umem->buffer)) { log_error("ERROR: UMEM size 
is not page aligned \"%s\"", strerror(errno)); goto out_error; @@ -524,6 +574,8 @@ int flash__configure_nf(struct nf **_nf, struct config *cfg) nf->thread[i]->socket->ifqueue = cfg->ifqueue[i]; nf->thread[i]->socket->idle_fd.fd = sockfd[i]; nf->thread[i]->socket->idle_fd.events = POLLIN; + nf->thread[i]->socket->backpressure_fd.fd = sockfd[i]; + nf->thread[i]->socket->backpressure_fd.events = POLLOUT; if (xsk_mmap_umem_rings(nf->thread[i]->socket, *cfg->umem_config, *cfg->xsk_config) < 0) { log_error("ERROR: (Ring setup) mmap failed \"%s\"", strerror(errno)); diff --git a/lib/flash/nf/flash_stats.c b/lib/flash/nf/flash_stats.c index 5f851d3..8560c1f 100644 --- a/lib/flash/nf/flash_stats.c +++ b/lib/flash/nf/flash_stats.c @@ -157,6 +157,8 @@ void flash__dump_stats(struct config *cfg, struct socket *xsk) if (cfg->xsk->mode & FLASH__BUSY_POLL) { if (cfg->smart_poll) printf("busy-poll | smart-poll "); + else if (cfg->sleep_poll) + printf("busy-poll | sleep-poll "); else printf("busy-poll "); } diff --git a/lib/flash/nf/flash_txrx.c b/lib/flash/nf/flash_txrx.c index 9f4e0bd..7db6fe9 100644 --- a/lib/flash/nf/flash_txrx.c +++ b/lib/flash/nf/flash_txrx.c @@ -120,6 +120,47 @@ static void __kick_tx(struct socket *xsk) exit(EXIT_FAILURE); } +static void __kick_rx(struct socket *xsk) +{ + int ret; + ret = recvfrom(xsk->fd, NULL, 0, MSG_MORE, NULL, 0); + + if (ret >= 0 || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) + return; + log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); + exit(EXIT_FAILURE); +} + +static uint32_t fill_ring_nb_entries(struct socket *xsk) +{ + return xsk->fill.size + (xsk->fill.cached_prod - xsk->fill.cached_cons); +} + +static uint32_t rx_ring_free_entries(struct socket *xsk) +{ + return xsk->rx.size - (xsk->rx.cached_prod - xsk->rx.cached_cons); +} + +static inline void __try_kick_rx(struct config *cfg, struct socket *xsk) +{ + if (!cfg->smart_poll) + return; + bool any = false; + for (int i = 0; i < cfg->prev_size; i++) { + if 
(cfg->nf_pollout_status[cfg->prev[i]] == 1) { + any = true; + break; + } + } + if ( + fill_ring_nb_entries(xsk) >= xsk->fill.size / 2 + && rx_ring_free_entries(xsk) > xsk->rx.size / 2 + && any + ) { + __kick_rx(xsk); + } +} + static inline void __complete_tx_rx_first(struct config *cfg, struct socket *xsk) { uint32_t idx_cq = 0, idx_fq = 0; @@ -213,6 +254,7 @@ static inline void __complete_tx_completions(struct config *cfg, struct socket * *xsk_ring_prod__fill_addr(&xsk->fill, idx_fq++) = *xsk_ring_cons__comp_addr(&xsk->comp, idx_cq++); xsk_ring_prod__submit(&xsk->fill, completed); + __try_kick_rx(cfg, xsk); } else { for (i = 0; i < completed; i++) { addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx_cq++); @@ -270,11 +312,18 @@ static inline uint32_t __old_reserve_tx(struct config *cfg, struct socket *xsk, static inline uint32_t __reserve_tx(struct config *cfg, struct socket *xsk, uint32_t num) { + if (cfg->xsk->mode & FLASH__BUSY_POLL && xsk->outstanding_tx > cfg->xsk->bp_thres / 2) { +#ifdef STATS + xsk->app_stats.tx_wakeup_sendtos++; +#endif + __complete_tx_completions(cfg, xsk); + __kick_tx(xsk); + } + uint32_t idx_tx = 0; - uint32_t ret; + uint32_t ret = 0; - ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); - while (ret != num) { + while ((cfg->smart_poll || cfg->sleep_poll) && xsk->outstanding_tx + num > cfg->xsk->bp_thres && cfg->next_size != 0) { __complete_tx_completions(cfg, xsk); if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->tx)) { #ifdef STATS @@ -282,14 +331,31 @@ static inline uint32_t __reserve_tx(struct config *cfg, struct socket *xsk, uint #endif __kick_tx(xsk); } - ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); - - if (cfg->smart_poll && ret != num && xsk->outstanding_tx >= cfg->xsk->bp_thres) { + if (cfg->smart_poll) { + cfg->nf_pollout_status[cfg->nf_id] = 1; + ret = flash__oldpoll(xsk, &xsk->backpressure_fd, 1, 1000); + cfg->nf_pollout_status[cfg->nf_id] = 0; + // will wake up when any of the 
next_nf->fill_ring->buffers > next_nf->fill_ring->size / 2 + // and next_nf->rx_ring_free > next_nf->rx_ring_size / 2 + // and current_nf->outstanding_tx < current_nf->xsk->tx->size / 2 + } else if (cfg->sleep_poll) { usleep(cfg->xsk->bp_timeout); + } + xsk->idle_timestamp = 0; + #ifdef STATS + xsk->app_stats.backpressure++; + #endif + } + ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); + while (ret != num) { + __complete_tx_completions(cfg, xsk); + if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->tx)) { #ifdef STATS - xsk->app_stats.backpressure++; + xsk->app_stats.tx_wakeup_sendtos++; #endif + __kick_tx(xsk); } + ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); } return idx_tx; } @@ -496,6 +562,9 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk __replenish_fill_ring(cfg, xsk, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); + + __try_kick_rx(cfg, xsk); + #ifdef STATS xsk->ring_stats.rx_npkts += eop_cnt; xsk->ring_stats.rx_frags += rcvd; @@ -642,6 +711,7 @@ size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk flash_pool__put(xsk->flash_pool, addr); } } + __try_kick_rx(cfg, xsk); #ifdef STATS xsk->ring_stats.drop_npkts += ndrop; diff --git a/lib/flash/params/flash_params.c b/lib/flash/params/flash_params.c index 346ef47..bd60b99 100644 --- a/lib/flash/params/flash_params.c +++ b/lib/flash/params/flash_params.c @@ -31,6 +31,8 @@ const struct option_wrapper long_options[] = { { { "smart-poll", no_argument, NULL, 'p' }, "Smart polling mode [default: disabled]" }, + { { "sleep-poll", no_argument, NULL, 's' }, "Periodic sleep mode [default: disabled]" }, + { { "idle-timeout", required_argument, NULL, 'i' }, "Idle timeout for smart polling mode in ms [default: 100]", "" }, { { "idleness", required_argument, NULL, 'I' }, @@ -40,7 +42,7 @@ const struct option_wrapper long_options[] = { { { "timeout", required_argument, NULL, 'b' }, "Sleep duration on backpressure in us 
[default: 1000]", "" }, { { "bp-sense", required_argument, NULL, 'B' }, - "Sensitivity for detecting backpressure, 0: 0 pkts - 1: 2048 pkts [default: 0.5]", + "Sensitivity for detecting backpressure, 0: 0 pkts - 1: 2048 pkts [default: 1]", "" }, { { "frags", no_argument, NULL, 'F' }, "Enable frags (multi-buffer) support -- not implemented yet", false }, @@ -155,7 +157,7 @@ static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper } /* Parse commands line args */ - while ((opt = getopt_long(argc, argv, "u:f:taxn:Qpi:I:b:B:Fw:h", long_options, &longindex)) != -1) { + while ((opt = getopt_long(argc, argv, "u:f:taxn:Qpsi:I:b:B:Fw:h", long_options, &longindex)) != -1) { switch (opt) { case 'u': cfg->umem_id = atoi(optarg); @@ -181,6 +183,9 @@ static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper case 'p': cfg->smart_poll = true; break; + case 's': + cfg->sleep_poll = true; + break; case 'i': cfg->xsk->idle_timeout = atoi(optarg); break; @@ -256,11 +261,12 @@ int flash__parse_cmdline_args(int argc, char **argv, struct config *cfg) cfg->extra_stats = false; cfg->verbose = true; cfg->smart_poll = false; + cfg->sleep_poll = false; cfg->xsk->idle_timeout = 100; cfg->xsk->poll_timeout = -1; cfg->xsk->idle_thres = 0; cfg->xsk->bp_timeout = 1000; - cfg->xsk->bp_thres = (__u32)(XSK_RING_PROD__DEFAULT_NUM_DESCS * 0.5); + cfg->xsk->bp_thres = (__u32)(XSK_RING_PROD__DEFAULT_NUM_DESCS); ret = parse_cmdline_args(argc, argv, long_options, cfg); if (ret < 0) diff --git a/lib/flash/uds/flash_uds.h b/lib/flash/uds/flash_uds.h index 8d10f45..fdc36af 100644 --- a/lib/flash/uds/flash_uds.h +++ b/lib/flash/uds/flash_uds.h @@ -25,6 +25,8 @@ #define FLASH__GET_IFNAME 13 #define FLASH__GET_IP_ADDR 14 #define FLASH__GET_DST_IP_ADDR 15 +#define FLASH__GET_POLLOUT_STATUS 16 +#define FLASH__GET_PREV_NF 17 /* UDS Control path APIs*/ diff --git a/lib/include/flash_defines.h b/lib/include/flash_defines.h index abf86b0..b0c8bb3 100644 --- 
a/lib/include/flash_defines.h +++ b/lib/include/flash_defines.h @@ -30,6 +30,8 @@ #define MS_PER_S 1000 +#define FLASH_MAX_XSK 64 + struct xsk_config { uint32_t bind_flags; uint32_t xdp_flags; @@ -64,6 +66,7 @@ struct config { struct xsk_umem_config *umem_config; struct xsk_socket_config *xsk_config; bool smart_poll; + bool sleep_poll; bool custom_xsk; int umem_id; int nf_id; @@ -71,6 +74,12 @@ struct config { bool frags_enabled; bool rx_first; volatile bool *done; + int next_size; + int nf_pollout_status_fd; + int nf_pollout_status_size; + volatile uint8_t *nf_pollout_status; + int *prev; + int prev_size; #ifdef STATS clockid_t clock; int verbose; @@ -172,6 +181,7 @@ struct socket { struct xsk_ring_prod fill; struct xsk_ring_cons comp; struct pollfd idle_fd; + struct pollfd backpressure_fd; bool idle; void *flash_pool; uint32_t outstanding_tx; @@ -201,6 +211,8 @@ struct nf { uint16_t port; int *next; // To be removed int next_size; + int *prev; + int prev_size; struct thread **thread; bool is_up; int thread_count; diff --git a/monitor/main.c b/monitor/main.c index c021cec..bc8654d 100644 --- a/monitor/main.c +++ b/monitor/main.c @@ -104,6 +104,22 @@ static void *handle_nf(void *arg) flash__send_data(msgsock, umem->nf[umem->nf[data->nf_id]->next[i]]->ip, INET_ADDRSTRLEN); } break; + case FLASH__GET_POLLOUT_STATUS: + flash__send_fd(msgsock, umem->cfg->nf_pollout_status_fd); + flash__send_data(msgsock, &umem->cfg->nf_pollout_status_size, sizeof(int)); + log_info("SENT POLLOUT STATUS MEM_FD: %d", umem->cfg->nf_pollout_status_fd); + log_info("SENT POLLOUT STATUS MEM_SIZE: %d", umem->cfg->nf_pollout_status_size); + break; + + case FLASH__GET_PREV_NF: + flash__send_data(msgsock, &umem->nf[data->nf_id]->prev_size, sizeof(int)); + log_info("Number of Previous NFs: %d", umem->nf[data->nf_id]->prev_size); + for (int i = 0; i < umem->nf[data->nf_id]->prev_size; i++) { + int prev_nf_id = umem->nf[data->nf_id]->prev[i]; + log_info("Sending Previous NF: %d", prev_nf_id); + 
flash__send_data(msgsock, &prev_nf_id, sizeof(int)); + } + break; case FLASH__CLOSE_CONN: close_nf(umem, data->umem_id, data->nf_id); From f8e39bbfbc612df6fde18001a0086e21c07f8f5e Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 17 Sep 2025 10:16:09 +0530 Subject: [PATCH 26/43] fix: fixed flash__recvmsg to support sleep_poll --- lib/flash/nf/flash_txrx.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/lib/flash/nf/flash_txrx.c b/lib/flash/nf/flash_txrx.c index 7db6fe9..728380a 100644 --- a/lib/flash/nf/flash_txrx.c +++ b/lib/flash/nf/flash_txrx.c @@ -152,11 +152,7 @@ static inline void __try_kick_rx(struct config *cfg, struct socket *xsk) break; } } - if ( - fill_ring_nb_entries(xsk) >= xsk->fill.size / 2 - && rx_ring_free_entries(xsk) > xsk->rx.size / 2 - && any - ) { + if (fill_ring_nb_entries(xsk) >= xsk->fill.size / 2 && rx_ring_free_entries(xsk) > xsk->rx.size / 2 && any) { __kick_rx(xsk); } } @@ -342,9 +338,9 @@ static inline uint32_t __reserve_tx(struct config *cfg, struct socket *xsk, uint usleep(cfg->xsk->bp_timeout); } xsk->idle_timestamp = 0; - #ifdef STATS - xsk->app_stats.backpressure++; - #endif +#ifdef STATS + xsk->app_stats.backpressure++; +#endif } ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); while (ret != num) { @@ -509,7 +505,7 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk /* Ensures that rx can happen during tx pressure */ __complete_tx_completions(cfg, xsk); - if (cfg->smart_poll && cfg->xsk->idle_timeout && xsk->idle_timestamp && rdtsc() > xsk->idle_timestamp) { + if ((cfg->smart_poll || cfg->sleep_poll) && cfg->xsk->idle_timeout && xsk->idle_timestamp && rdtsc() > xsk->idle_timestamp) { ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, -1); if (ret <= 0) { xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); @@ -528,13 +524,13 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk 
recvfrom(xsk->fd, NULL, 0, MSG_DONTWAIT, NULL, NULL); } - if (cfg->smart_poll && cfg->xsk->idle_timeout && !xsk->idle_timestamp) + if ((cfg->smart_poll || cfg->sleep_poll) && cfg->xsk->idle_timeout && !xsk->idle_timestamp) xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); return 0; } - if (cfg->smart_poll && (rcvd >= cfg->xsk->idle_thres || xsk->outstanding_tx)) + if ((cfg->smart_poll || cfg->sleep_poll) && (rcvd >= cfg->xsk->idle_thres || xsk->outstanding_tx)) xsk->idle_timestamp = 0; if (rcvd > cfg->xsk->batch_size) From ff4509537867ce2034918e3b1fb4e5e8b1641f51 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 23 Oct 2025 13:04:49 +0530 Subject: [PATCH 27/43] feat: Mitigate Head-of-Line Blocking in Multiple Flows - Added new option track_tx_budget to track outstanding_tx per outgoing edge (disabled by default). - Introduced an option to specify the total maximum outstanding TX when track_tx_budget is enabled (default: 256). - Added new API: flash__track_tx_and_drop. When track_tx_budget is enabled, this function selectively transmits or drops packets to maximize total throughput. - Added new NF example: multi-flow-tx.c demonstrates the usage of the above features. 
--- examples/unit-tests/meson.build | 5 +- examples/unit-tests/multi-flow-tx.c | 512 ++++++++++++++++++++++++++++ lib/flash/nf/flash_nf.c | 13 + lib/flash/nf/flash_nf.h | 18 + lib/flash/nf/flash_txrx.c | 56 ++- lib/flash/params/flash_params.c | 27 +- lib/include/flash_defines.h | 6 + 7 files changed, 633 insertions(+), 4 deletions(-) create mode 100644 examples/unit-tests/multi-flow-tx.c diff --git a/examples/unit-tests/meson.build b/examples/unit-tests/meson.build index b79c542..15756b2 100644 --- a/examples/unit-tests/meson.build +++ b/examples/unit-tests/meson.build @@ -17,4 +17,7 @@ ring_benchmark = files('ring-benchmark.c') executable('ring-benchmark', ring_benchmark, c_args: cflags, install: true, dependencies: deps) backpressure = files('backpressure.c') -executable('backpressure', backpressure, c_args: cflags, install: true, dependencies: deps) \ No newline at end of file +executable('backpressure', backpressure, c_args: cflags, install: true, dependencies: deps) + +multi_flow_tx = files('multi-flow-tx.c') +executable('multi-flow-tx', multi_flow_tx, c_args: cflags, install: true, dependencies: deps) \ No newline at end of file diff --git a/examples/unit-tests/multi-flow-tx.c b/examples/unit-tests/multi-flow-tx.c new file mode 100644 index 0000000..b488bcb --- /dev/null +++ b/examples/unit-tests/multi-flow-tx.c @@ -0,0 +1,512 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025 Debojeet Das + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define TEST_PORT 8080 + +volatile bool done = false; +struct config *cfg = NULL; +struct nf *nf; +struct test_stats *stats_arr; + +static void int_exit(int sig) +{ + log_info("Received Signal: %d", sig); + done = true; +} + +struct testHeader { + uint8_t lastHop; + uint8_t hopCount; + uint64_t pktId; + uint16_t old_dst; +}; + +struct test_stats { + uint64_t pkt_count; +}; + +struct appconf { + int cpu_start; + int 
cpu_end; + int stats_cpu; + int hops; + __u64 burn_cycles; + bool variable; + __u64 variable_start; + __u64 variable_end; +} app_conf; + +#if defined(__ARM_ARCH_ISA_A64) +// ARM64 based implementation +static inline __u64 rdtsc(void) +{ + __u64 cntvct; + asm volatile("mrs %0, cntvct_el0; " : "=r"(cntvct)::"memory"); + return cntvct; +} + +static inline __u64 rdtsc_precise(void) +{ + __u64 cntvct; + asm volatile("isb; mrs %0, cntvct_el0; isb; " : "=r"(cntvct)::"memory"); + return cntvct; +} +#elif defined(__x86_64__) +// AMD64 based implementation +static inline __u64 rdtsc(void) +{ + union { + __u64 tsc_64; + struct { + __u32 lo_32; + __u32 hi_32; + }; + } tsc; + + asm volatile("rdtsc" : "=a"(tsc.lo_32), "=d"(tsc.hi_32)); + + return tsc.tsc_64; +} + +static inline __u64 rdtsc_precise(void) +{ + asm volatile("mfence"); + return rdtsc(); +} +#endif + +static void burn_cycles(__u64 cycles_to_burn) +{ + __u64 start = rdtsc(); + while ((rdtsc() - start) < cycles_to_burn) { + // Burn cycles + } +} + +static const char *flow_bp_options[] = { "-c \tStart CPU (default: 0)", + "-e \tEnd CPU (default: 0)", + "-s \tStats CPU (default: 1)", + "-h \tNumber of hops (default: 1)", + "-B \tBurn cycles (default: 0)", + "-v\t\tEnable variable-length packets (default: disabled)", + "-a \tVariable start value (default: 0)", + "-z \tVariable end value (default: 0)", + NULL }; + +static int parse_app_args(int argc, char **argv, struct appconf *app_conf, int shift) +{ + int c; + opterr = 0; + + // Default values + app_conf->cpu_start = 0; + app_conf->cpu_end = 0; + app_conf->stats_cpu = 1; + app_conf->hops = 1; + app_conf->burn_cycles = 0; + app_conf->variable = false; + app_conf->variable_start = 0; + app_conf->variable_end = 0; + + argc -= shift; + argv += shift; + + while ((c = getopt(argc, argv, "c:e:s:h:B:va:z:")) != -1) + switch (c) { + case 'c': + app_conf->cpu_start = atoi(optarg); + break; + case 'e': + app_conf->cpu_end = atoi(optarg); + break; + case 's': + app_conf->stats_cpu 
= atoi(optarg); + break; + case 'h': + app_conf->hops = atoi(optarg); + break; + case 'B': + app_conf->burn_cycles = atoi(optarg); + break; + case 'v': + app_conf->variable = true; + break; + case 'a': + app_conf->variable_start = atoi(optarg); + break; + case 'z': + app_conf->variable_end = atoi(optarg); + break; + default: + printf("Usage: %s -h\n", argv[-shift]); + return -1; + } + return 0; +} + +static void process_packets(void *data, __u32 *len) +{ + void *pos = data; + void *data_end = data + *len; + + struct ethhdr *eth = (struct ethhdr *)pos; + if ((void *)(eth + 1) > data_end) { + log_error("Ethernet header is not valid"); + return; + } + + if (eth->h_proto != htons(ETH_P_IP)) { + log_error("Ethernet protocol is not IP"); + return; + } + + pos = eth + 1; + + struct iphdr *iph = pos; + size_t hdrsize; + + if ((void *)iph + 1 > data_end) { + log_error("IP header is not valid"); + return; + } + + hdrsize = iph->ihl * 4; + /* Sanity check packet field is valid */ + if (hdrsize < sizeof(*iph)) { + log_error("IP header size is invalid"); + return; + } + + if (iph->protocol != IPPROTO_UDP) { + log_error("IP protocol is not UDP"); + return; + } + + /* Variable-length IPv4 header, need to use byte-based arithmetic */ + if (pos + hdrsize > data_end) { + log_error("IP header is not valid"); + return; + } + + pos += hdrsize; + + size_t payload_len; + struct udphdr *udphdr = pos; + + if ((void *)udphdr + 1 > data_end) { + log_error("UDP header is not valid"); + return; + } + + pos = udphdr + 1; + payload_len = ntohs(udphdr->len) - sizeof(struct udphdr); + + size_t testHeaderLen = sizeof(struct testHeader); + void *payload_end = pos + payload_len; + + struct testHeader *testHeader = NULL; + + /* First NF */ + if (ntohs(udphdr->dest) != TEST_PORT) { + // Append test header at the end of the UDP payload + testHeader = (struct testHeader *)payload_end; + testHeader->lastHop = app_conf.hops; + testHeader->hopCount = 1; + testHeader->old_dst = udphdr->dest; + + *len += 
testHeaderLen; + udphdr->len = htons(ntohs(udphdr->len) + testHeaderLen); + iph->tot_len = htons(ntohs(iph->tot_len) + testHeaderLen); + + udphdr->dest = htons(TEST_PORT); + } else { + // check if the test header is present + if (payload_len < testHeaderLen) { + log_error("ERROR: Test header not found in packet"); + return; + } + + // testHeader is at the end of the UDP payload + testHeader = (struct testHeader *)(payload_end - testHeaderLen); + testHeader->hopCount++; + } + + if (testHeader->lastHop == testHeader->hopCount) { + uint8_t tmp_mac[ETH_ALEN]; + struct in_addr tmp_ip; + unsigned short tmp_port; + payload_len -= testHeaderLen; + + tmp_port = testHeader->old_dst; + + udphdr->dest = tmp_port; + udphdr->len = htons(ntohs(udphdr->len) - testHeaderLen); + *len -= testHeaderLen; + + tmp_port = udphdr->dest; + udphdr->dest = udphdr->source; + udphdr->source = tmp_port; + + iph->tot_len = htons(ntohs(iph->tot_len) - testHeaderLen); + + memcpy(tmp_mac, eth->h_dest, ETH_ALEN); + memcpy(eth->h_dest, eth->h_source, ETH_ALEN); + memcpy(eth->h_source, tmp_mac, ETH_ALEN); + + memcpy(&tmp_ip, &iph->saddr, sizeof(tmp_ip)); + memcpy(&iph->saddr, &iph->daddr, sizeof(tmp_ip)); + memcpy(&iph->daddr, &tmp_ip, sizeof(tmp_ip)); + } + + return; +} + +struct sock_args { + int socket_id; + int next_size; +}; + +static void *socket_routine(void *arg) +{ + nfds_t nfds = 1; + int ret, next_size; + struct socket *xsk; + struct xskvec *xskvecs, *sendvecs, *dropvecs; + struct pollfd fds[1] = {}; + uint32_t i, nrecv, nsend, count, nb_frags = 0, wsend, wdrop, ndrop; + struct sock_args *a = (struct sock_args *)arg; + + next_size = a->next_size; + + log_debug("SOCKET_ID: %d", a->socket_id); + xsk = nf->thread[a->socket_id]->socket; + + cfg->xsk->poll_timeout = -1; + + xskvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!xskvecs) { + log_error("ERROR: Memory allocation failed for xskvecs"); + return NULL; + } + + sendvecs = calloc(cfg->xsk->batch_size, sizeof(struct 
xskvec)); + if (!sendvecs) { + log_error("ERROR: Memory allocation failed for sendvecs"); + free(xskvecs); + return NULL; + } + + dropvecs = calloc(cfg->xsk->batch_size, sizeof(struct xskvec)); + if (!dropvecs) { + log_error("ERROR: Memory allocation failed for dropvecs"); + free(xskvecs); + free(sendvecs); + return NULL; + } + + fds[0].fd = xsk->fd; + fds[0].events = POLLIN; + + count = 0; + for (;;) { + ret = flash__poll(cfg, xsk, fds, nfds); + if (!(ret == 1 || ret == -2)) + continue; + + nrecv = flash__recvmsg(cfg, xsk, xskvecs, cfg->xsk->batch_size); + + for (i = 0; i < nrecv; i++) { + if (next_size != 0) { + xskvecs[i].options = ((count % next_size) << 16) | (xskvecs[i].options & 0xFFFF); + count++; + } + char *pkt = xskvecs[i].data; + + if (!nb_frags++) + process_packets(pkt, &xskvecs[i].len); + + if (IS_EOP_DESC(xskvecs[i].options)) + nb_frags = 0; + + if (app_conf.variable) { + __u64 random_cycles = + app_conf.variable_start + (rand() % (app_conf.variable_end - app_conf.variable_start)); + burn_cycles(random_cycles); + + } else { + burn_cycles(app_conf.burn_cycles); + } + } + + flash__track_tx_and_drop(cfg, xsk, xskvecs, nrecv, sendvecs, &wsend, dropvecs, &wdrop); + + if (nrecv) { + nsend = flash__sendmsg(cfg, xsk, sendvecs, wsend); + if (nsend != wsend) { + log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); + break; + } + ndrop = flash__dropmsg(cfg, xsk, dropvecs, wdrop); + if (ndrop != wdrop) { + log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); + break; + } + } + + if (done) + break; + } + free(xskvecs); + free(sendvecs); + free(dropvecs); + return NULL; +} + +static void *worker__stats(void *conf) +{ + struct stats_conf *arg = (struct stats_conf *)conf; + struct nf *nf = arg->nf; + struct config *cfg = arg->cfg; + + if (cfg->verbose) { + unsigned int interval = cfg->stats_interval; + setlocale(LC_ALL, ""); + + for (int i = 0; i < cfg->total_sockets; i++) + nf->thread[i]->socket->timestamp = flash__get_nsecs(cfg); + + while (!done) { + 
sleep(interval); + if (system("clear") != 0) + log_error("Terminal clear error"); + for (int i = 0; i < cfg->total_sockets; i++) { + flash__dump_stats(cfg, nf->thread[i]->socket); + } + } + } + return NULL; +} + +int main(int argc, char **argv) +{ + int shift; + struct sock_args *args; + struct stats_conf stats_cfg = { NULL }; + cpu_set_t cpuset; + pthread_t socket_thread, stats_thread; + + cfg = calloc(1, sizeof(struct config)); + if (!cfg) { + log_error("ERROR: Memory allocation failed\n"); + exit(EXIT_FAILURE); + } + + cfg->app_name = "Flow Backpressure Application"; + cfg->app_options = flow_bp_options; + cfg->done = &done; + + shift = flash__parse_cmdline_args(argc, argv, cfg); + if (shift < 0) + goto out_cfg; + + if (parse_app_args(argc, argv, &app_conf, shift) < 0) + goto out_cfg; + + if (flash__configure_nf(&nf, cfg) < 0) + goto out_cfg; + + stats_arr = calloc(cfg->total_sockets, sizeof(struct test_stats)); + if (!stats_arr) { + log_error("ERROR: Memory allocation failed for stats_arr"); + goto out_cfg; + } + + log_info("Control Plane Setup Done"); + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + signal(SIGABRT, int_exit); + + log_info("STARTING Data Path"); + + args = calloc(cfg->total_sockets, sizeof(struct sock_args)); + if (!args) { + log_error("ERROR: Memory allocation failed for sock_args"); + goto out_cfg_close; + } + + for (int i = 0; i < cfg->total_sockets; i++) { + args[i].socket_id = i; + args[i].next_size = nf->next_size; + + log_info("2_NEXT_SIZE: %d", args[i].next_size); + + if (pthread_create(&socket_thread, NULL, socket_routine, &args[i])) { + log_error("Error creating socket thread"); + goto out_args; + } + CPU_ZERO(&cpuset); + CPU_SET((i % (app_conf.cpu_end - app_conf.cpu_start + 1)) + app_conf.cpu_start, &cpuset); + if (pthread_setaffinity_np(socket_thread, sizeof(cpu_set_t), &cpuset) != 0) { + log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); + goto out_args; + } + + if (pthread_detach(socket_thread) 
!= 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + } + + stats_cfg.nf = nf; + stats_cfg.cfg = cfg; + + if (pthread_create(&stats_thread, NULL, worker__stats, &stats_cfg)) { + log_error("Error creating statistics thread"); + goto out_args; + } + CPU_ZERO(&cpuset); + CPU_SET(app_conf.stats_cpu, &cpuset); + if (pthread_setaffinity_np(stats_thread, sizeof(cpu_set_t), &cpuset) != 0) { + log_error("ERROR: Unable to set thread affinity: %s\n", strerror(errno)); + goto out_args; + } + if (pthread_detach(stats_thread) != 0) { + log_error("ERROR: Unable to detach thread: %s", strerror(errno)); + goto out_args; + } + + flash__wait(cfg); + + flash__xsk_close(cfg, nf); + + return EXIT_SUCCESS; + +out_args: + done = true; + free(args); +out_cfg_close: + free(stats_arr); + sleep(1); + flash__xsk_close(cfg, nf); +out_cfg: + free(cfg); + exit(EXIT_FAILURE); +} diff --git a/lib/flash/nf/flash_nf.c b/lib/flash/nf/flash_nf.c index 125909a..cfd0e78 100644 --- a/lib/flash/nf/flash_nf.c +++ b/lib/flash/nf/flash_nf.c @@ -577,6 +577,19 @@ int flash__configure_nf(struct nf **_nf, struct config *cfg) nf->thread[i]->socket->backpressure_fd.fd = sockfd[i]; nf->thread[i]->socket->backpressure_fd.events = POLLOUT; + if (nf->next_size != 0) { + for (int j = 0; j < nf->next_size; j++) { + nf->thread[i]->socket->per_edge_max_outstanding_tx[j] = 1; + } + } else { + nf->thread[i]->socket->per_edge_max_outstanding_tx[0] = 1; + } + nf->thread[i]->socket->completed_tx_descs = (int *)calloc(cfg->max_outstanding_tx, sizeof(int)); + memset(nf->thread[i]->socket->completed_tx_descs, -1, sizeof(int) * cfg->max_outstanding_tx); + for (int j = 0; j < nf->next_size; j++) { + nf->thread[i]->socket->completed_tx_descs[nf->thread[i]->socket->completed_idx++] = j; + } + if (xsk_mmap_umem_rings(nf->thread[i]->socket, *cfg->umem_config, *cfg->xsk_config) < 0) { log_error("ERROR: (Ring setup) mmap failed \"%s\"", strerror(errno)); goto out_error; diff --git 
a/lib/flash/nf/flash_nf.h b/lib/flash/nf/flash_nf.h index 8f1cee5..5ea4c83 100644 --- a/lib/flash/nf/flash_nf.h +++ b/lib/flash/nf/flash_nf.h @@ -105,6 +105,24 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk */ size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nsend); +/** + * Use this function when there are multiple next NFs with different/variable throughput. + * It tracks the outstanding tx per NF and drops packets if necessary. + * + * @param cfg: Pointer to the configuration structure. + * @param xsk: Pointer to the socket structure. + * @param xskvecs: Pointer to the array of xskvec structures containing received data. + * @param nrecv: Number of messages received. + * @param sendvecs: Pointer to the array of xskvec structures to send data. + * @param nsend: Pointer to the number of messages to send. + * @param dropvecs: Pointer to the array of xskvec structures to drop data. + * @param ndrop: Pointer to the number of messages to drop. + * + * @return void + */ +void flash__track_tx_and_drop(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nrecv, struct xskvec *sendvecs, + uint32_t *nsend, struct xskvec *dropvecs, uint32_t *ndrop); + /** * Drop messages from the socket.
* diff --git a/lib/flash/nf/flash_txrx.c b/lib/flash/nf/flash_txrx.c index 728380a..793bd8e 100644 --- a/lib/flash/nf/flash_txrx.c +++ b/lib/flash/nf/flash_txrx.c @@ -210,6 +210,7 @@ static inline void __complete_tx_completions(struct config *cfg, struct socket * uint32_t idx_cq = 0, idx_fq = 0; uint32_t completed, num_outstanding, i, ret; uint64_t addr; + int temp; if (!xsk->outstanding_tx) return; @@ -246,14 +247,43 @@ static inline void __complete_tx_completions(struct config *cfg, struct socket * ret = xsk_ring_prod__reserve(&xsk->fill, completed, &idx_fq); } - for (i = 0; i < completed; i++) - *xsk_ring_prod__fill_addr(&xsk->fill, idx_fq++) = *xsk_ring_cons__comp_addr(&xsk->comp, idx_cq++); + for (i = 0; i < completed; i++) { + addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx_cq++); + if (cfg->track_tx_budget && cfg->next_size != 0) { + uint8_t *data_ptr = (uint8_t *)(cfg->umem->buffer + addr); // edge_id + xsk->per_edge_outstanding[*data_ptr]--; + temp = xsk->completed_tx_descs[xsk->completed_idx]; + if (temp != -1) { + xsk->per_edge_max_outstanding_tx[temp]--; + } + xsk->per_edge_max_outstanding_tx[*data_ptr]++; + xsk->completed_tx_descs[xsk->completed_idx] = *data_ptr; + xsk->completed_idx = (xsk->completed_idx + 1) & (cfg->max_outstanding_tx - 1); + + *data_ptr = 0; + } + *xsk_ring_prod__fill_addr(&xsk->fill, idx_fq++) = addr; + } xsk_ring_prod__submit(&xsk->fill, completed); __try_kick_rx(cfg, xsk); } else { for (i = 0; i < completed; i++) { addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx_cq++); + if (cfg->track_tx_budget && cfg->next_size != 0) { + uint8_t *data_ptr = (uint8_t *)(cfg->umem->buffer + addr); // edge_id + xsk->per_edge_outstanding[*data_ptr]--; + + temp = xsk->completed_tx_descs[xsk->completed_idx]; + if (temp != -1) { + xsk->per_edge_max_outstanding_tx[temp]--; + } + xsk->per_edge_max_outstanding_tx[*data_ptr]++; + xsk->completed_tx_descs[xsk->completed_idx] = *data_ptr; + xsk->completed_idx = (xsk->completed_idx + 1) & 
(cfg->max_outstanding_tx - 1); + + *data_ptr = 0; + } flash_pool__put(xsk->flash_pool, addr); } } @@ -658,6 +688,28 @@ size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk return nsend; } +void flash__track_tx_and_drop(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nrecv, struct xskvec *sendvecs, + uint32_t *nsend, struct xskvec *dropvecs, uint32_t *ndrop) +{ + uint32_t i, next_size = cfg->next_size, wsend = 0, wdrop = 0, edge; + for (i = 0; i < nrecv; i++) { + if (next_size == 0 || !cfg->track_tx_budget) { + sendvecs[wsend++] = xskvecs[i]; + continue; + } + + edge = (xskvecs[i].options >> 16) & 0xFFFF; + if (xsk->per_edge_max_outstanding_tx[edge] > xsk->per_edge_outstanding[edge]) { + xsk->per_edge_outstanding[edge]++; + sendvecs[wsend++] = xskvecs[i]; + } else { + dropvecs[wdrop++] = xskvecs[i]; + } + } + *nsend = wsend; + *ndrop = wdrop; +} + size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop) { uint32_t i; diff --git a/lib/flash/params/flash_params.c b/lib/flash/params/flash_params.c index bd60b99..9488b8b 100644 --- a/lib/flash/params/flash_params.c +++ b/lib/flash/params/flash_params.c @@ -49,6 +49,13 @@ const struct option_wrapper long_options[] = { { { "clock", required_argument, NULL, 'w' }, "Clock NAME (default MONOTONIC) -- not implemented yet", "", false }, + { { "track-outstanding-tx", no_argument, NULL, 'o' }, "Track outstanding Tx for each outgoing edge [default: false]", false }, + + { { "max-outstanding-tx", required_argument, NULL, 'O' }, + "Maximum outstanding Tx packets for this NF (default: 256 (only in powers of 2))", + "", + false }, + { { 0, 0, NULL, 0 }, NULL, false } }; @@ -157,7 +164,7 @@ static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper } /* Parse commands line args */ - while ((opt = getopt_long(argc, argv, "u:f:taxn:Qpsi:I:b:B:Fw:h", long_options, &longindex)) != -1) { + while ((opt = 
getopt_long(argc, argv, "u:f:taxn:Qpsi:I:b:B:Fw:hoO:", long_options, &longindex)) != -1) { switch (opt) { case 'u': cfg->umem_id = atoi(optarg); @@ -205,6 +212,22 @@ static int parse_cmdline_args(int argc, char **argv, const struct option_wrapper if (get_clockid(&cfg->clock, optarg)) log_warn("ERROR: Invalid clock %s. Default to CLOCK_MONOTONIC.", optarg); break; + case 'o': + cfg->track_tx_budget = true; + break; + case 'O': + cfg->max_outstanding_tx = atoi(optarg); + // if not power of 2, make this nearest power of 2, floor + if (cfg->max_outstanding_tx & (cfg->max_outstanding_tx - 1)) { + int power = 1; + while (power <= cfg->max_outstanding_tx) + power <<= 1; + power >>= 1; + log_warn("WARNING: --max-outstanding-tx=%d is not a power of two. Using %d instead.", + cfg->max_outstanding_tx, power); + cfg->max_outstanding_tx = power; + } + break; case 'h': full_help = true; /* fall-through */ @@ -267,6 +290,8 @@ int flash__parse_cmdline_args(int argc, char **argv, struct config *cfg) cfg->xsk->idle_thres = 0; cfg->xsk->bp_timeout = 1000; cfg->xsk->bp_thres = (__u32)(XSK_RING_PROD__DEFAULT_NUM_DESCS); + cfg->track_tx_budget = false; + cfg->max_outstanding_tx = 256; ret = parse_cmdline_args(argc, argv, long_options, cfg); if (ret < 0) diff --git a/lib/include/flash_defines.h b/lib/include/flash_defines.h index b0c8bb3..f2f6315 100644 --- a/lib/include/flash_defines.h +++ b/lib/include/flash_defines.h @@ -80,6 +80,8 @@ struct config { volatile uint8_t *nf_pollout_status; int *prev; int prev_size; + bool track_tx_budget; + int max_outstanding_tx; #ifdef STATS clockid_t clock; int verbose; @@ -186,6 +188,10 @@ struct socket { void *flash_pool; uint32_t outstanding_tx; uint64_t idle_timestamp; + int per_edge_max_outstanding_tx[FLASH_MAX_XSK]; + int per_edge_outstanding[FLASH_MAX_XSK]; + int* completed_tx_descs; + int completed_idx; #ifdef STATS struct xsk_ring_stats ring_stats; struct xsk_app_stats app_stats; From 5a37cc179fe4e00adabd3d37dbfdad118253b42b Mon Sep 17 
00:00:00 2001 From: Debojeet Das Date: Wed, 29 Oct 2025 18:13:49 +0530 Subject: [PATCH 28/43] fix: install script says kernel not installed --- usertools/flash_kernel/install.sh | 71 +++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/usertools/flash_kernel/install.sh b/usertools/flash_kernel/install.sh index 2651882..36bdbc0 100755 --- a/usertools/flash_kernel/install.sh +++ b/usertools/flash_kernel/install.sh @@ -12,10 +12,12 @@ if [ "$EUID" -ne 0 ] fi if [ "$1" = "" ]; then - echo "Provide the kernel directory as an argument" echo "Usage: $0 [nproc]" - echo "[nproc] is optional" - echo "Example: $0 /path/to/kernel 7" + echo "" + echo " is the path to the kernel source directory" + echo "[nproc] is the number of parallel jobs to use for compilation. (default: number of CPU cores)" + echo "" + echo "Example: $0 /path/to/kernel 2" exit 1 fi @@ -24,42 +26,67 @@ set -u # Take the kernel directory as an argument KERNEL_DIR=$1 -NPROC=1 -if [ "$2" != "" ]; then - NPROC=$2 +NPROC=$(nproc) +if [ -n "${2:-}" ]; then + NPROC="$2" fi +echo "Using $NPROC parallel jobs for compilation..." + # Change to kernel directory -cd $KERNEL_DIR +cd "$KERNEL_DIR" -printf "Installing dependencies...\n" -sudo apt install -y build-essential libncurses-dev bison flex libssl-dev libelf-dev fakeroot dwarves +echo "Installing dependencies..." +sudo apt install -y build-essential libncurses-dev bison flex libssl-dev libelf-dev fakeroot dwarves bc -# Configure kernel -printf "Configuring kernel...\n" -cp -v /boot/config-$(uname -r) .config -(yes "" || true) | make localmodconfig +# Prompt if the user wants quick build +echo -n "Do you want to perform a quick build? (Y/n) " +read -r QUICK_BUILD +if [[ "$QUICK_BUILD" == "n" || "$QUICK_BUILD" == "N" ]]; then + echo "Performing full configuration..." + make olddefconfig +else + echo "Configuring kernel with quick build..." 
+ cp -v /boot/config-$(uname -r) .config + (yes "" || true) | make localmodconfig +fi scripts/config --disable SYSTEM_TRUSTED_KEYS scripts/config --disable SYSTEM_REVOCATION_KEYS scripts/config --set-str CONFIG_SYSTEM_TRUSTED_KEYS "" scripts/config --set-str CONFIG_SYSTEM_REVOCATION_KEYS "" +GCC_MAJOR=$(gcc -dumpfullversion -dumpversion | cut -d. -f1) +EXTRA_FLAGS="" +if [ "$GCC_MAJOR" -ge 15 ]; then + echo "GCC $GCC_MAJOR detected — installing GCC 14..." + sudo apt install -y gcc-14 g++-14 + echo "Using GCC-14 for kernel build..." + EXTRA_FLAGS="CC=gcc-14 HOSTCC=gcc-14" +fi + # Compile the kernel -printf "Compiling kernel...\n" -make -j$(nproc) +echo "Compiling kernel..." +make -j"$NPROC" $EXTRA_FLAGS # Install the kernel -printf "Installing kernel...\n" -sudo make modules_install -sudo make install +echo "Installing kernel..." +sudo make modules_install $EXTRA_FLAGS +sudo make install $EXTRA_FLAGS set +xu -if [ -z "$(awk -F\' '/menuentry / {print $2}' /boot/grub/grub.cfg | grep -m 1 'Ubuntu, with Linux 6.10.6-flash+')" ]; then - printf "Cannot find flash kernel. Please install the kernel manually.\n" +FLASH_KERNEL=$(awk -F"'" '/menuentry / {print $2}' /boot/grub/grub.cfg \ + | grep -m 1 -E 'Ubuntu, with Linux 6\.10\.6-[0-9.]+-flash\+') + +if [ -z "$FLASH_KERNEL" ]; then + echo "Cannot find flash kernel. Please install the kernel manually." exit 1 fi -printf "flash kernel is installed. To boot into flash kernel, please reboot the system:\n" -printf " sudo reboot\n" \ No newline at end of file +echo "flash kernel is installed." 
+echo "To boot into flash kernel immediately, run:" +echo +echo -e " sudo grub-reboot \"Advanced options for Ubuntu>$(echo "$FLASH_KERNEL")\"" +echo -e " sudo reboot now" +echo From 4601cc6d518a8dbc3f99be1734249434f03fa8c8 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Wed, 29 Oct 2025 18:17:52 +0530 Subject: [PATCH 29/43] docs: updated FLASH kernel documentation --- README.md | 62 ++++++-- doc/flash_kernel/flash_kernel.md | 248 ++++++++++++++++++++++++++----- 2 files changed, 259 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index da6e1ad..c9adb46 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,56 @@ -# FLASH Userspace Library +# FLASH: Fast Linked AF_XDP Sockets for High Performance Network Services - + [![Commitizen friendly](https://img.shields.io/badge/commitizen-friendly-brightgreen.svg)](http://commitizen.github.io/cz-cli/) ![Ubuntu 24.04](https://github.com/rickydebojeet/flash/actions/workflows/main.yml/badge.svg) -FLASH: Fast Linked AF_XDP Sockets for High Performance Network Services +FLASH is a high-speed userspace library that makes it easy to build efficient, unprivileged AF_XDP applications for modern cloud and edge deployments. -A userspace library that lets you link isolated unprivileged AF_XDP network functions to boost performance using FLASH out-of-tree kernel. It’s also great for deploying network functions in containers without needing a custom kernel, but without chaining support. +Seamlessly integrated with the **FLASH kernel**, it extends AF_XDP to enable true zero-copy packet sharing between network functions (NFs) and network devices, unlocking performance that surpasses traditional AF_XDP chaining solutions. -## Baremetal Usage Instructions +## Key Features +- **Zero-Copy Packet Sharing**: Unlock unparalleled throughput and minimal latency with zero-copy data paths between NFs and network devices. 
+- **Unprivileged Operation**: Run AF_XDP applications securely without root access simplifying deployment while maintaining isolation. +- **Packet Isolation**: Ensure strong packet-level isolation between NFs, even when sharing memory powered by Rust and FLASH kernel safeguards. +- **Backward Compatibility**: Chain existing AF_XDP applications in copy-based mode with no code changes — easy migration, no disruption. +- **Flexible Deployment Options**: Deploy seamlessly on bare metal or in containers for consistent, isolated environments. Works on standard Linux kernels too (without zero-copy chaining support). +- **Multi tenant Support**: Designed for shared environments — the OS remains in control of resources, ensuring safety and fairness when multiple users or tenants share the same host. Unlike DPDK, FLASH plays nicely in multi-tenant and cloud-native setups. -Baremetal Deployment has been tested on Ubuntu 24.04 hosts and is expected to function similarly on Ubuntu 22.04 hosts. However, it may encounter build failures on Ubuntu 20.04 and older versions due to the absence of necessary libraries in the apt repository. -For standalone NFs operations, Linux kernel versions 5.17.5 and later are recommended. +## Getting Started -### Building +Clone the repositories and install the FLASH kernel for zero-copy chaining support. +```bash +git clone https://github.com/networkedsystemsIITB/flash.git +git clone https://github.com/networkedsystemsIITB/flash-linux.git +cd flash +sudo ./usertools/flash_kernel/install.sh ../flash-linux +``` + +The `install.sh` script will build and install the kernel along with its modules. +It requires the path to the flash-linux repository as the first argument. +An optional second argument can be provided to specify the number of processors to use during the build. + +During execution, the script will prompt you to choose between a quick build and a full build. Select the quick build option for faster compilation. 
+ +Follow the on-screen instructions provided in the terminal after installation to boot into the FLASH kernel. + +For more details, refer to the [FLASH Kernel Guide](./doc/flash_kernel/flash_kernel.md). + +### Building the userspace library and examples + +FLASH has been tested on Ubuntu 24.04 and is expected to work similarly on Ubuntu 22.04. +Older versions (e.g., Ubuntu 20.04 or earlier) may encounter build issues due to missing dependencies. -The library is built on top of libbpf and libxdp. You can install the dependencies using the following commands: +> Recommended Kernel Version: 5.17.5 or later for standalone NF operations. + +Install the required dependencies using the following command: ```bash -sudo apt install -y build-essential meson libbpf-dev pkg-config git gcc-multilib clang llvm lld m4 libpcap-dev libcjson-dev libncurses-dev +sudo apt install -y build-essential meson libbpf-dev pkg-config git gcc-multilib clang llvm lld m4 libpcap-dev libcjson-dev libncurses-dev libnuma-dev ``` -The libxdp library is not available in the Ubuntu repositories. You can build it from source using the following commands: +`libxdp` is not included in Ubuntu repositories — build it from source: ```bash git clone https://github.com/xdp-project/xdp-tools.git @@ -30,7 +58,13 @@ make -j -C xdp-tools libxdp sudo PREFIX=/usr make -j -C xdp-tools libxdp_install ``` -Once you have installed the dependencies, you can build the library using the following commands: +Install Rust for using the Rust components of FLASH: + +```bash +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +Once dependencies are ready, build the library and examples: ```bash make @@ -83,7 +117,3 @@ You can also use docker compose to deploy multiple NFs at the same time. ```bash docker compose up -d ``` - -### Chaining NFs using FLASH Monitor - -To chain NFs you need to install a custom out-of-tree kernel. Checkout the instructions [here](./doc/flash_kernel/flash_kernel.rst). 
diff --git a/doc/flash_kernel/flash_kernel.md b/doc/flash_kernel/flash_kernel.md index babbab9..a942eed 100644 --- a/doc/flash_kernel/flash_kernel.md +++ b/doc/flash_kernel/flash_kernel.md @@ -1,72 +1,250 @@ - + -# Introduction +# FLASH Kernel Guide -This documents describes how to install flash linux out-of-tree kernel in a system. +The **FLASH Kernel** extends the Linux **AF_XDP** subsystem to provide zero-copy packet redirection and efficient in-kernel data paths for high-performance user-space networking. -## System Requirements and Building flash kernel +It introduces: +- A high-speed, in-kernel redirection mechanism between AF_XDP sockets. +- A sysfs control interface for managing AF_XDP sockets securely from user space. -flash kernel is based on linux kernel v6.10.6. This documentation assumes that you are building and installing the kernel on a -Ubuntu 24.04 LTS host. Other distributions may work, but this documentation assumes Ubuntu. +#### Quick Links +⚙️ [Kernel Installation](#kernel-installation) +🧩 [Sysfs Interface](#sysfs-interface-for-af_xdp-socket-management) +🔁 [Interrupt vs. Busy-Polling (`poll()` Usage)](#using-poll-for-interrupt-vs-busy-polling-mode) +🧠 [Backpressure Handling (`poll()` & `recvfrom()`)](#backpressure-handling-with-poll-and-recvfrom) +🚦 [TX Tracking per Flow (HOL Mitigation)](#tx-tracking-per-flow-mitigating-head-of-line-blocking) +🧹 [Uninstalling the FLASH Kernel](#uninstalling-the-flash-kernel) -## Clone the flash kernel repository -``` -git clone https://github.com/rickydebojeet/linux.git -``` +## System Requirements -Make sure to note the path to the flash kernel repository. This path will be used in the next steps. +The FLASH kernel is based on Linux kernel v6.10.6. This documentation assumes that you are building and installing the kernel on an Ubuntu host. Other distributions may work, but this documentation assumes Ubuntu. 
-## Install the kernel automatically +We have tested the FLASH kernel on the following Ubuntu versions: +- Ubuntu 22.04 LTS +- Ubuntu 24.04 LTS +- Ubuntu 25.04 +- Ubuntu 25.10 -Use the following steps to install the kernel automatically. +> **Note:** Ubuntu 25.10 and newer ship with **GCC 15**, which is incompatible with the FLASH kernel. +> Install GCC 14 before building to avoid compilation errors. +## Kernel Installation + +### Automated Installation (Recommended) + +You can use the `install.sh` script to build and install the FLASH kernel automatically. + +```bash +git clone https://github.com/networkedsystemsIITB/flash.git +git clone https://github.com/networkedsystemsIITB/flash-linux.git +cd flash +sudo ./usertools/flash_kernel/install.sh ../flash-linux ``` -sudo ./usertools/flash_kernel/install.sh -cp -v /boot/config-$(uname -r) .config -(yes "" || true) | make localmodconfig +#### Build the kernel: + +```bash +git clone https://github.com/networkedsystemsIITB/flash-linux.git +cd flash-linux +make olddefconfig scripts/config --disable SYSTEM_TRUSTED_KEYS scripts/config --disable SYSTEM_REVOCATION_KEYS scripts/config --set-str CONFIG_SYSTEM_TRUSTED_KEYS "" scripts/config --set-str CONFIG_SYSTEM_REVOCATION_KEYS "" ``` -Now you can build the kernel: +For advanced configuration, see the [Linux kernel build guide](https://www.kernel.org/doc/Documentation/admin-guide/README.rst). 
-``` +Now build: + +```bash make -j$(nproc) ``` -After the kernel is built, you can install it: +#### Install the kernel: -``` +```bash sudo make modules_install sudo make install ``` -After the kernel is installed, you just need to reboot the system: +After the kernel is installed, you just need to reboot the system and select the FLASH kernel from the GRUB menu: -``` +```bash sudo reboot -``` \ No newline at end of file +``` + +After rebooting, you can verify that the FLASH kernel is running by executing: + +```bash +uname -r +``` + +## FLASH Kernel Features + +> This section is intended for developers building network frameworks or libraries on top of the FLASH kernel. +> Regular users should rely on the FLASH userspace library, which abstracts these details automatically. + +### Sysfs Interface for AF_XDP Socket Management + +FLASH kernel exposes a sysfs interface under `/sys/kernel/flash` allowing privileged users to: +- Inspect active AF_XDP sockets +- Configure redirection rules between sockets +- Adjust per-socket parameters (e.g., TX tracking) + +Each AF_XDP socket is identified by a process-independent identifier called a flash-id, which enables cross-process management. + +When a new AF_XDP socket is created, FLASH automatically registers it in sysfs under a dedicated directory: `/sys/kernel/flash//` + +Below is an example sysfs layout for three sockets with flash-ids 1, 2, and 3: + +```bash +/sys/kernel/flash/ +│ +├── tx_tracking # Global TX tracking control (0 or 1) +│ +├── 1/ +│ ├── pid # Process ID owning this socket +│ ├── procname # Process name +│ ├── ifindex # Network interface index +│ ├── qid # Queue ID +│ └── next # Redirection targets +│ +├── 2/ +│ └── ... +│ +└── 3/ + └── ... +``` + +Socket directories are dynamically created when a socket is registered and automatically removed upon closure. + +#### Configuring Redirections + +The `next` file defines downstream paths for packet redirection.
+ +- A value of -1 means the socket transmits packets directly to the NIC (no redirection). +- Writing one or more flash-ids to this file defines new downstream redirection targets. + +**Examples:** + +a. Redirect socket 1 → socket 2: + +```console +# cat /sys/kernel/flash/1/next +-1 +# echo 2 | sudo tee /sys/kernel/flash/1/next +# cat /sys/kernel/flash/1/next +index flash_id +0 2 +``` + +Redirect socket 1 → sockets 2 and 3: + +```console +# cat /sys/kernel/flash/1/next +-1 +# echo "2 3" | sudo tee /sys/kernel/flash/1/next +# cat /sys/kernel/flash/1/next +index flash_id +0 2 +1 3 +``` + +Clear all redirections: + +```console +# echo "-1" | sudo tee /sys/kernel/flash/1/next +# cat /sys/kernel/flash/1/next +-1 +``` + +#### Runtime redirection semantics + +When sending packets from an AF_XDP socket: + +- Single target: All packets are forwarded to that target. +- Multiple targets: The lower 16 bits of the packet descriptor’s flags field determine the destination index (default = 0). [index is the value written in the `next` file] + +The redirection automatically happens in zero-copy if the sockets share UMEM. If the sockets do not share UMEM, FLASH falls back to copying packets between sockets. + +From user-space, this behavior is transparent; sending to redirected sockets behaves as with regular AF_XDP sockets. + +### Using `poll()` for Interrupt vs. Busy-Polling Mode + +AF_XDP sockets can operate in both interrupt-driven and busy-polling modes, allowing applications to balance CPU utilization and latency depending on workload characteristics. + +Recommended workflow: + +1. **Start in interrupt mode:** Use `poll()` with the `POLLIN` flag to block until packets arrive. +2. **Switch to busy-polling:** When packet rates are consistently high, switch to a busy loop using `recvfrom()` to continuously process packets. This eliminates interrupt latency and can improve throughput. +3. 
**Revert to interrupt mode:** When load decreases, return to interrupt mode to save CPU cycles. + +FLASH ensures that `poll()` correctly reflects packet readiness even when redirection chains are configured, enabling seamless transitions between modes without losing events or packets. + +### Backpressure Handling with `poll()` and `recvfrom()` + +Backpressure arises when downstream sockets (receivers) cannot process packets as fast as they are being produced. The FLASH Kernel introduces natural backpressure into the AF_XDP data path via the TX and CQ rings: packets are not transmitted to downstream sockets if their RX rings are full. + +In this scenario, the sender must retry transmissions until space becomes available. To avoid wasting CPU cycles through busy-waiting, applications should rely on the following readiness mechanism: + +- When congestion is detected on a sender socket, use `poll()` with the `POLLOUT` flag to sleep until the socket is ready to send again. +- Receiver sockets should use `recvfrom()` with the `MSG_MORE` flag to implicitly signal the sender once they have freed space. + +The FLASH kernel ensures that the signal from the receiver propagates upstream through the redirection chain, waking up any blocked senders. This cooperative signaling model helps maintain steady throughput while preventing packet loss or excessive CPU usage under heavy load. + +### TX Tracking per Flow (Mitigating Head-of-Line Blocking) + +When multiple downstream sockets are configured for redirection, one slow or congested target can cause head-of-line (HOL) blocking, where faster flows are stalled by slower ones. To mitigate this, FLASH provides a global TX tracking mechanism, which can be enabled or disabled through the `tx_tracking` sysfs file. 
+ +```bash +echo 1 | sudo tee /sys/kernel/flash/tx_tracking # Enable TX tracking +echo 0 | sudo tee /sys/kernel/flash/tx_tracking # Disable TX tracking +``` + +When enabled, FLASH tracks packet transmission status on a per-flow basis. +As packets are transmitted successfully, the kernel writes back the `flash_id` of the downstream socket into the memory location specified by the packet descriptor before returning it to the completion queue. This allows user-space applications to identify which downstream path successfully transmitted each packet. + +Applications can use this feedback to implement: +- Dynamic congestion control or flow rerouting, +- Per-destination pacing or fair queuing, and +- Custom recovery strategies to reduce the impact of HOL blocking. + +> Note: TX tracking adds slight overhead due to per-flow bookkeeping. It is recommended for applications that require advanced flow management or fairness across multiple redirection targets. + +## Uninstalling the FLASH Kernel + +You can use the `uninstall.sh` script to remove the FLASH kernel and restore the previous kernel. + +```bash +cd flash +sudo ./usertools/flash_kernel/uninstall.sh +``` + +The script will ask for the kernel version to uninstall and will update the GRUB configuration accordingly. After uninstalling, reboot the system to boot into the previous kernel. + +> Note: Make sure that the previous kernel is still installed on your system before uninstalling the FLASH kernel. 
From a9ba5215c29b044dd5aae6ff3b84b0813a9538be Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Wed, 29 Oct 2025 18:22:51 +0530 Subject: [PATCH 30/43] docs: fixed typos --- doc/flash_kernel/flash_kernel.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/flash_kernel/flash_kernel.md b/doc/flash_kernel/flash_kernel.md index a942eed..4a40c6c 100644 --- a/doc/flash_kernel/flash_kernel.md +++ b/doc/flash_kernel/flash_kernel.md @@ -163,7 +163,7 @@ index flash_id 0 2 ``` -Redirect socket 1 → sockets 2 and 3: +b. Redirect socket 1 → sockets 2 and 3: ```console # cat /sys/kernel/flash/1/next @@ -175,7 +175,7 @@ index flash_id 1 3 ``` -Clear all redirections: +c. Clear all redirections: ```console # echo "-1" | sudo tee /sys/kernel/flash/1/next From 30959b6cfc4b2c2e9bb996be6fd3122a897e1414 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Wed, 29 Oct 2025 18:23:54 +0530 Subject: [PATCH 31/43] docs: updated terminologies --- doc/flash_kernel/flash_kernel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/flash_kernel/flash_kernel.md b/doc/flash_kernel/flash_kernel.md index 4a40c6c..52603b6 100644 --- a/doc/flash_kernel/flash_kernel.md +++ b/doc/flash_kernel/flash_kernel.md @@ -188,7 +188,7 @@ c. Clear all redirections: When sending packets from an AF_XDP socket: - Single target: All packets are forwarded to that target. -- Multiple targets: The lower 16 bits of the packet descriptor’s flags field determine the destination index (default = 0). [index is the value written in the `next` file] +- Multiple targets: The lower 16 bits of the packet descriptor’s flags field determine the destination index (default = 0). [index is the value shown in the `next` file] The redirection automatically happens in zero-copy if the sockets share UMEM. If the sockets do not share UMEM, FLASH falls back to copying packets between sockets. 
From 344ae6eaf35fe3b21d8a6ca79daad540dc7c295b Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Thu, 30 Oct 2025 13:06:05 +0530 Subject: [PATCH 32/43] docs: added driver documentation --- doc/flash_kernel/flash_kernel.md | 90 ++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/doc/flash_kernel/flash_kernel.md b/doc/flash_kernel/flash_kernel.md index 52603b6..0d44556 100644 --- a/doc/flash_kernel/flash_kernel.md +++ b/doc/flash_kernel/flash_kernel.md @@ -14,6 +14,7 @@ It introduces: 🔁 [Interrupt vs. Busy-Polling (`poll()` Usage)](#using-poll-for-interrupt-vs-busy-polling-mode) 🧠 [Backpressure Handling (`poll()` & `recvfrom()`)](#backpressure-handling-with-poll-and-recvfrom) 🚦 [TX Tracking per Flow (HOL Mitigation)](#tx-tracking-per-flow-mitigating-head-of-line-blocking) +🧰 [Adding Driver Support](#adding-driver-support) 🧹 [Uninstalling the FLASH Kernel](#uninstalling-the-flash-kernel) @@ -30,6 +31,12 @@ We have tested the FLASH kernel on the following Ubuntu versions: > **Note:** Ubuntu 25.10 and newer ship with **GCC 15**, which is incompatible with the FLASH kernel. > Install GCC 14 before building to avoid compilation errors. +The zero-copy redirection feature requires some support from the NIC driver. Currently, the following NIC drivers are supported: +- Intel `ixgbe` driver (10GbE) +- Intel `i40e` driver (40GbE) +- Intel `ice` driver (100GbE and above) +- Mellanox `mlx5` driver (10GbE and above) + ## Kernel Installation ### Automated Installation (Recommended) You can use the `install.sh` script to build and install the FLASH kernel automatically. @@ -236,6 +243,89 @@ Applications can use this feedback to implement: > Note: TX tracking adds slight overhead due to per-flow bookkeeping. It is recommended for applications that require advanced flow management or fairness across multiple redirection targets. +## Adding Driver Support + +To enable zero-copy redirection for an AF_XDP-supported NIC, the driver must be updated slightly to support the FLASH kernel’s redirection mechanism. 
+ +Packet redirection in FLASH occurs when packets are transmitted from an AF_XDP socket. +Depending on the driver implementation, this is typically handled using one of the following APIs: +- `xsk_tx_peek_desc()` and `xsk_tx_release()` APIs used by most standard AF_XDP drivers +- `xsk_tx_peek_release_desc_batch()` API used by batch-oriented drivers for higher throughput + +### Supporting FLASH with `xsk_tx_peek_desc()` and `xsk_tx_release()` + +In the standard AF_XDP transmission flow: +1. The driver calls `xsk_tx_peek_desc()` to fetch a descriptor for transmission. +2. The NAPI TX poll function collects all such descriptors into a batch and transmits them. +3. The TX ring is released after the batch using `xsk_tx_release()`. + +If no descriptor is returned by `xsk_tx_peek_desc()`, the driver stops processing further descriptors for transmission. + +In the FLASH kernel, when redirection is configured, the driver must not transmit packets to the NIC, but it should continue processing all remaining descriptors. + +To achieve this, the driver should check a flag in the AF_XDP socket’s pool structure: +`pool->no_tx_out` — this boolean flag is set when redirection is active. + +**Example: Transmission Function with FLASH Support** + +```c +bool xmit(struct xsk_buff_pool *pool, unsigned int budget) +{ + struct xdp_desc desc; + + while (budget-- > 0) { + // Fetch a descriptor for transmission + if (!xsk_tx_peek_desc(pool, &desc)) + break; + + // If redirection is configured, skip NIC transmission + if (pool->no_tx_out) + continue; + + // Proceed with normal transmission + } + + // After processing all descriptors, trigger redirection if needed + if (pool->no_tx_out) + xsk_tx_release(pool); + + return !budget; +} +``` + +### Supporting FLASH with `xsk_tx_peek_release_desc_batch()` + +Drivers that use batch-oriented transmission can integrate FLASH support similarly. + +In the batch transmission flow: +1. 
The driver calls `xsk_tx_peek_release_desc_batch()` to fetch and release a batch of transmission descriptors. +2. The NIC driver then transmits the corresponding packets. + +In the FLASH kernel, when redirection is configured, the driver should skip NIC transmission but still process all descriptors by calling `xsk_tx_peek_release_desc_batch()`. +The same `pool->no_tx_out` flag applies in this case as well. + +**Example: Batch Transmission Function with FLASH Support** + +```c +bool xmit_batch(struct xsk_buff_pool *pool, unsigned int budget) +{ + struct xdp_desc *descs = pool->tx_descs; + unsigned int nb_pkts = 0; + + // Fetch a batch of descriptors for transmission + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); + if (!nb_pkts) + return true; + + // If redirection is configured, skip NIC transmission + if (pool->no_tx_out) + return nb_pkts < budget; + + // Proceed with normal transmission + return nb_pkts < budget; +} +``` + ## Uninstalling the FLASH Kernel You can use the `uninstall.sh` script to remove the FLASH kernel and restore the previous kernel. 
From ec9178a250f6047ec66de8cc901ac8be5c9b31ba Mon Sep 17 00:00:00 2001 From: Arghyadip Chakraborty Date: Thu, 30 Oct 2025 13:59:46 +0530 Subject: [PATCH 33/43] feat(rust): added tui for stats rust library: - refactored stats module - added tui - added msrv to Cargo.toml - updated repo in Cargo.toml - chained if lets - reset idle timestamp after bp timeout - added minor optimizations and refactors - fixed/allowed clippy warnings rust examples: - added stats and tui - made ctrlc optional but default - updated nf mac and dest mac logic - preallocated vectors for send and drop descs --- examples/arpresolver-rs/Cargo.toml | 5 +- examples/arpresolver-rs/src/cli.rs | 35 ++++- examples/arpresolver-rs/src/main.rs | 88 +++++++++-- examples/arpresolver-rs/src/nf.rs | 10 +- examples/firewall-rs/Cargo.toml | 5 +- examples/firewall-rs/src/cli.rs | 28 ++++ examples/firewall-rs/src/main.rs | 76 +++++++-- examples/firewall-rs/src/nf.rs | 33 +--- examples/helloworld-rs/src/main.rs | 2 +- examples/ip4ping-rs/Cargo.toml | 6 +- examples/ip4ping-rs/src/cli.rs | 32 ++++ examples/ip4ping-rs/src/main.rs | 81 ++++++++-- examples/l2fwd-rs/Cargo.toml | 5 +- examples/l2fwd-rs/src/cli.rs | 28 ++++ examples/l2fwd-rs/src/main.rs | 72 +++++++-- examples/maglev-rs/Cargo.toml | 5 +- examples/maglev-rs/src/cli.rs | 33 ++++ examples/maglev-rs/src/main.rs | 108 +++++++++---- examples/maglev-rs/src/nf.rs | 16 +- examples/simplefwd-rs/Cargo.toml | 6 +- examples/simplefwd-rs/src/cli.rs | 32 ++++ examples/simplefwd-rs/src/main.rs | 82 ++++++++-- lib/flash-rs/Cargo.toml | 6 +- lib/flash-rs/src/client.rs | 32 ++-- .../config/{config_noclap.rs => config.rs} | 0 lib/flash-rs/src/config/mod.rs | 22 +-- lib/flash-rs/src/error.rs | 4 +- lib/flash-rs/src/fd/fd.rs | 7 +- lib/flash-rs/src/fd/mod.rs | 4 +- lib/flash-rs/src/lib.rs | 8 +- lib/flash-rs/src/mem/desc.rs | 2 +- lib/flash-rs/src/mem/pool.rs | 13 +- lib/flash-rs/src/mem/ring/mod.rs | 5 +- lib/flash-rs/src/stats/mod.rs | 5 + lib/flash-rs/src/{ => 
stats}/stats.rs | 46 ++---- lib/flash-rs/src/stats/sub.rs | 32 ++++ lib/flash-rs/src/tui/dashboard.rs | 147 ++++++++++++++++++ lib/flash-rs/src/tui/error.rs | 12 ++ lib/flash-rs/src/tui/layout.rs | 110 +++++++++++++ lib/flash-rs/src/tui/layout_str.rs | 50 ++++++ lib/flash-rs/src/tui/mod.rs | 13 ++ lib/flash-rs/src/tui/panel.rs | 91 +++++++++++ lib/flash-rs/src/tui/widget/app.rs | 90 +++++++++++ lib/flash-rs/src/tui/widget/meta.rs | 36 +++++ lib/flash-rs/src/tui/widget/mod.rs | 25 +++ lib/flash-rs/src/tui/widget/ring.rs | 60 +++++++ lib/flash-rs/src/tui/widget/xdp.rs | 95 +++++++++++ lib/flash-rs/src/xsk/mod.rs | 3 +- lib/flash-rs/src/xsk/socket.rs | 63 ++++---- 49 files changed, 1503 insertions(+), 266 deletions(-) rename lib/flash-rs/src/config/{config_noclap.rs => config.rs} (100%) create mode 100644 lib/flash-rs/src/stats/mod.rs rename lib/flash-rs/src/{ => stats}/stats.rs (54%) create mode 100644 lib/flash-rs/src/stats/sub.rs create mode 100644 lib/flash-rs/src/tui/dashboard.rs create mode 100644 lib/flash-rs/src/tui/error.rs create mode 100644 lib/flash-rs/src/tui/layout.rs create mode 100644 lib/flash-rs/src/tui/layout_str.rs create mode 100644 lib/flash-rs/src/tui/mod.rs create mode 100644 lib/flash-rs/src/tui/panel.rs create mode 100644 lib/flash-rs/src/tui/widget/app.rs create mode 100644 lib/flash-rs/src/tui/widget/meta.rs create mode 100644 lib/flash-rs/src/tui/widget/mod.rs create mode 100644 lib/flash-rs/src/tui/widget/ring.rs create mode 100644 lib/flash-rs/src/tui/widget/xdp.rs diff --git a/examples/arpresolver-rs/Cargo.toml b/examples/arpresolver-rs/Cargo.toml index 52ee173..fd6fd84 100644 --- a/examples/arpresolver-rs/Cargo.toml +++ b/examples/arpresolver-rs/Cargo.toml @@ -6,14 +6,15 @@ edition = "2024" [dependencies] clap = { version = "4.5.35", features = ["derive"] } core_affinity = "0.8.3" -ctrlc = "3.4.5" +ctrlc = { version = "3.4.5", optional = true } flash = { path = "../../lib/flash-rs", features = ["clap"] } macaddr = "1.0.1" tracing = { 
version = "0.1.41", optional = true } tracing-subscriber = { version = "0.3.19", optional = true } [features] -default = [] +default = ["dep:ctrlc"] +stats = ["flash/stats", "flash/tui"] tracing = ["dep:tracing", "dep:tracing-subscriber", "flash/tracing"] [lints.rust] diff --git a/examples/arpresolver-rs/src/cli.rs b/examples/arpresolver-rs/src/cli.rs index c8dbe04..00e29e8 100644 --- a/examples/arpresolver-rs/src/cli.rs +++ b/examples/arpresolver-rs/src/cli.rs @@ -1,7 +1,13 @@ +#[cfg(feature = "stats")] +use std::str::FromStr as _; + use clap::Parser; use flash::FlashConfig; use macaddr::MacAddr6; +#[cfg(feature = "stats")] +use flash::tui::GridLayout; + #[derive(Debug, Parser)] pub struct Cli { #[command(flatten)] @@ -23,6 +29,31 @@ pub struct Cli { )] pub cpu_end: usize, - #[arg(short = 'm', long, help = "Interface MAC address")] - pub mac_addr: MacAddr6, + #[arg(short = 'M', long, help = "NF MAC address")] + pub nf_mac: MacAddr6, + + #[arg(short = 'm', long, help = "Dest MAC address")] + pub mac_addr: Option, + + #[cfg(feature = "stats")] + #[command(flatten)] + pub stats: StatsConfig, +} + +#[cfg(feature = "stats")] +#[derive(Debug, Parser)] +pub struct StatsConfig { + #[arg( + short = 's', + long, + default_value_t = 1, + help = "CPU core index for stats thread" + )] + pub cpu: usize, + + #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + pub fps: u64, + + #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] + pub layout: GridLayout, } diff --git a/examples/arpresolver-rs/src/main.rs b/examples/arpresolver-rs/src/main.rs index cb6ee75..fa8821a 100644 --- a/examples/arpresolver-rs/src/main.rs +++ b/examples/arpresolver-rs/src/main.rs @@ -14,9 +14,18 @@ use clap::Parser; use flash::Socket; use macaddr::MacAddr6; +#[cfg(feature = "stats")] +use flash::tui::StatsDashboard; + use crate::cli::Cli; -fn socket_thread(mut socket: Socket, mac_addr: MacAddr6, ip_addr: 
Ipv4Addr, run: &Arc) { +fn socket_thread( + mut socket: Socket, + nf_mac: MacAddr6, + nf_ip: Ipv4Addr, + mac_addr: Option, + run: &Arc, +) { while run.load(Ordering::SeqCst) { if !socket.poll().is_ok_and(|val| val) { continue; @@ -26,11 +35,25 @@ fn socket_thread(mut socket: Socket, mac_addr: MacAddr6, ip_addr: Ipv4Addr, run: continue; }; - let (descs_send, descs_drop) = descs.into_iter().partition(|desc| { - socket - .read_exact(desc) - .is_ok_and(|pkt| nf::arp_resolve(pkt, mac_addr, ip_addr)) - }); + let mut descs_send = Vec::with_capacity(descs.len()); + let mut descs_drop = Vec::with_capacity(descs.len()); + + for mut desc in descs { + let Ok(pkt) = socket.read_exact(&desc) else { + descs_drop.push(desc); + continue; + }; + + if nf::arp_resolve(pkt, nf_mac, nf_ip) { + desc.set_next(1); + } + + if let Some(mac_addr) = mac_addr { + pkt[0..6].copy_from_slice(mac_addr.as_bytes()); + } + + descs_send.push(desc); + } socket.send(descs_send); socket.drop(descs_drop); @@ -59,6 +82,19 @@ fn main() { #[cfg(feature = "tracing")] tracing::debug!("Sockets: {:?}", sockets); + #[cfg(feature = "stats")] + let mut tui = match StatsDashboard::new( + sockets.iter().map(Socket::stats), + cli.stats.fps, + cli.stats.layout, + ) { + Ok(t) => t, + Err(err) => { + eprintln!("error creating tui: {err}"); + return; + } + }; + let cores = core_affinity::get_core_ids() .unwrap_or_default() .into_iter() @@ -66,19 +102,32 @@ fn main() { .collect::>(); if cores.is_empty() { - eprintln!("No cores found in range {}-{}", cli.cpu_start, cli.cpu_end); + eprintln!("no cores found in range {}-{}", cli.cpu_start, cli.cpu_end); return; } #[cfg(feature = "tracing")] tracing::debug!("Cores: {:?}", cores); + #[cfg(feature = "stats")] + let Some(stats_core) = core_affinity::get_core_ids() + .unwrap_or_default() + .into_iter() + .find(|core_id| core_id.id == cli.stats.cpu) + else { + eprintln!("no core found for stats thread {}", cli.stats.cpu); + return; + }; + let run = Arc::new(AtomicBool::new(true)); - 
let r = run.clone(); - if let Err(err) = ctrlc::set_handler(move || { - r.store(false, Ordering::SeqCst); - }) { + #[cfg(not(feature = "stats"))] + if let Err(err) = { + let r = run.clone(); + ctrlc::set_handler(move || { + r.store(false, Ordering::SeqCst); + }) + } { eprintln!("error setting Ctrl-C handler: {err}"); return; } @@ -90,11 +139,26 @@ fn main() { let r = run.clone(); thread::spawn(move || { core_affinity::set_for_current(core_id); - socket_thread(socket, cli.mac_addr, route.ip_addr, &r); + socket_thread(socket, cli.nf_mac, route.ip_addr, cli.mac_addr, &r); }) }) .collect::>(); + #[cfg(feature = "stats")] + if let Err(err) = thread::spawn(move || { + core_affinity::set_for_current(stats_core); + if let Err(err) = tui.run() { + eprintln!("error dumping stats: {err}"); + } + }) + .join() + { + eprintln!("error in stats thread: {err:?}"); + } + + #[cfg(feature = "stats")] + run.store(false, Ordering::SeqCst); + for handle in handles { if let Err(err) = handle.join() { eprintln!("error in thread: {err:?}"); diff --git a/examples/arpresolver-rs/src/nf.rs b/examples/arpresolver-rs/src/nf.rs index 15e2c95..5e4f0b7 100644 --- a/examples/arpresolver-rs/src/nf.rs +++ b/examples/arpresolver-rs/src/nf.rs @@ -13,7 +13,7 @@ const ARP_OPCODE_REPLY: u16 = 2; #[forbid(clippy::indexing_slicing)] #[inline] -pub fn arp_resolve(pkt: &mut [u8; 42], mac_addr: MacAddr6, ip_addr: Ipv4Addr) -> bool { +pub fn arp_resolve(pkt: &mut [u8; 42], nf_addr: MacAddr6, nf_ip: Ipv4Addr) -> bool { if u16::from_be_bytes([pkt[12], pkt[13]]) != ETHER_TYPE_ARP || u16::from_be_bytes([pkt[14], pkt[15]]) != ARP_HTYPE_ETHERNET || u16::from_be_bytes([pkt[16], pkt[17]]) != ARP_PTYPE_IPV4 @@ -24,18 +24,18 @@ pub fn arp_resolve(pkt: &mut [u8; 42], mac_addr: MacAddr6, ip_addr: Ipv4Addr) -> return false; } - if pkt[38..42] != ip_addr.octets() { + if pkt[38..42] != nf_ip.octets() { return false; } let mut tmp = [0u8; 6]; - tmp.copy_from_slice(&pkt[6..12]); + pkt[0..6].copy_from_slice(&tmp); 
pkt[32..38].copy_from_slice(&tmp); - pkt[6..12].copy_from_slice(&mac_addr.into_array()); - pkt[22..28].copy_from_slice(&mac_addr.into_array()); + pkt[6..12].copy_from_slice(&nf_addr.into_array()); + pkt[22..28].copy_from_slice(&nf_addr.into_array()); pkt[20..22].copy_from_slice(&ARP_OPCODE_REPLY.to_be_bytes()); diff --git a/examples/firewall-rs/Cargo.toml b/examples/firewall-rs/Cargo.toml index e031f77..db462fb 100644 --- a/examples/firewall-rs/Cargo.toml +++ b/examples/firewall-rs/Cargo.toml @@ -7,7 +7,7 @@ edition = "2024" clap = { version = "4.5.35", features = ["derive"] } core_affinity = "0.8.3" csv = "1.3.1" -ctrlc = "3.4.5" +ctrlc = { version = "3.4.5", optional = true } flash = { path = "../../lib/flash-rs", features = ["clap"] } macaddr = "1.0.1" serde = { version = "1.0.219", features = ["derive"] } @@ -15,7 +15,8 @@ tracing = { version = "0.1.41", optional = true } tracing-subscriber = { version = "0.3.19", optional = true } [features] -default = [] +default = ["dep:ctrlc"] +stats = ["flash/stats", "flash/tui"] tracing = ["dep:tracing", "dep:tracing-subscriber", "flash/tracing"] [lints.rust] diff --git a/examples/firewall-rs/src/cli.rs b/examples/firewall-rs/src/cli.rs index 6b0bffb..7123c6d 100644 --- a/examples/firewall-rs/src/cli.rs +++ b/examples/firewall-rs/src/cli.rs @@ -1,9 +1,15 @@ use std::path::PathBuf; +#[cfg(feature = "stats")] +use std::str::FromStr as _; + use clap::Parser; use flash::FlashConfig; use macaddr::MacAddr6; +#[cfg(feature = "stats")] +use flash::tui::GridLayout; + #[derive(Debug, Parser)] pub struct Cli { #[command(flatten)] @@ -30,4 +36,26 @@ pub struct Cli { #[arg(short = 'm', long, help = "Dest MAC address")] pub mac_addr: Option, + + #[cfg(feature = "stats")] + #[command(flatten)] + pub stats: StatsConfig, +} + +#[cfg(feature = "stats")] +#[derive(Debug, Parser)] +pub struct StatsConfig { + #[arg( + short = 's', + long, + default_value_t = 1, + help = "CPU core index for stats thread" + )] + pub cpu: usize, + + #[arg(short 
= 'f', long, default_value_t = 1, help = "Tui frames per second")] + pub fps: u64, + + #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] + pub layout: GridLayout, } diff --git a/examples/firewall-rs/src/main.rs b/examples/firewall-rs/src/main.rs index 165b204..c042e5a 100644 --- a/examples/firewall-rs/src/main.rs +++ b/examples/firewall-rs/src/main.rs @@ -13,6 +13,9 @@ use clap::Parser; use flash::Socket; use macaddr::MacAddr6; +#[cfg(feature = "stats")] +use flash::tui::StatsDashboard; + use crate::{cli::Cli, nf::Firewall}; fn socket_thread( @@ -30,11 +33,22 @@ fn socket_thread( continue; }; - let (descs_send, descs_drop) = descs.into_iter().partition(|desc| { - socket - .read_exact(desc) - .is_ok_and(|pkt| nf::firewall_filter(pkt, firewall, mac_addr)) - }); + let mut descs_send = Vec::with_capacity(descs.len()); + let mut descs_drop = Vec::with_capacity(descs.len()); + + for desc in descs { + if let Ok(pkt) = socket.read_exact(&desc) + && nf::firewall_filter(firewall, pkt) + { + if let Some(mac_addr) = mac_addr { + pkt[0..6].copy_from_slice(mac_addr.as_bytes()); + } + + descs_send.push(desc); + } else { + descs_drop.push(desc); + } + } socket.send(descs_send); socket.drop(descs_drop); @@ -60,6 +74,22 @@ fn main() { return; } + #[cfg(feature = "tracing")] + tracing::debug!("Sockets: {:?}", sockets); + + #[cfg(feature = "stats")] + let mut tui = match StatsDashboard::new( + sockets.iter().map(Socket::stats), + cli.stats.fps, + cli.stats.layout, + ) { + Ok(t) => t, + Err(err) => { + eprintln!("error creating tui: {err}"); + return; + } + }; + let firewall = match Firewall::new(cli.denylist) { Ok(firewall) => Arc::new(firewall), Err(err) => { @@ -82,12 +112,25 @@ fn main() { #[cfg(feature = "tracing")] tracing::debug!("Cores: {:?}", cores); + #[cfg(feature = "stats")] + let Some(stats_core) = core_affinity::get_core_ids() + .unwrap_or_default() + .into_iter() + .find(|core_id| core_id.id == 
cli.stats.cpu) + else { + eprintln!("no core found for stats thread {}", cli.stats.cpu); + return; + }; + let run = Arc::new(AtomicBool::new(true)); - let r = run.clone(); - if let Err(err) = ctrlc::set_handler(move || { - r.store(false, Ordering::SeqCst); - }) { + #[cfg(not(feature = "stats"))] + if let Err(err) = { + let r = run.clone(); + ctrlc::set_handler(move || { + r.store(false, Ordering::SeqCst); + }) + } { eprintln!("error setting Ctrl-C handler: {err}"); return; } @@ -106,6 +149,21 @@ fn main() { }) .collect::>(); + #[cfg(feature = "stats")] + if let Err(err) = thread::spawn(move || { + core_affinity::set_for_current(stats_core); + if let Err(err) = tui.run() { + eprintln!("error dumping stats: {err}"); + } + }) + .join() + { + eprintln!("error in stats thread: {err:?}"); + } + + #[cfg(feature = "stats")] + run.store(false, Ordering::SeqCst); + for handle in handles { if let Err(err) = handle.join() { eprintln!("error in thread: {err:?}"); diff --git a/examples/firewall-rs/src/nf.rs b/examples/firewall-rs/src/nf.rs index b6cd47d..fe37d99 100644 --- a/examples/firewall-rs/src/nf.rs +++ b/examples/firewall-rs/src/nf.rs @@ -3,7 +3,6 @@ use std::{net::Ipv4Addr, path::Path}; use csv::Reader; -use macaddr::MacAddr6; use serde::{Deserialize, de}; const ETHER_TYPE_IPV4: u16 = 0x0800; @@ -78,30 +77,12 @@ impl Tuple5 { } #[inline] -pub fn firewall_filter( - pkt: &mut [u8; 54], - firewall: &Firewall, - mac_addr: Option, -) -> bool { - if u16::from_be_bytes([pkt[12], pkt[13]]) != ETHER_TYPE_IPV4 { - return false; +pub fn firewall_filter(firewall: &Firewall, pkt: &mut [u8; 54]) -> bool { + if u16::from_be_bytes([pkt[12], pkt[13]]) == ETHER_TYPE_IPV4 + && let Some(tuple5) = Tuple5::new(pkt) + { + !firewall.blocked(&tuple5) + } else { + false } - - let Some(tuple5) = Tuple5::new(pkt) else { - return false; - }; - - if firewall.blocked(&tuple5) { - return false; - } - - if let Some(mac_addr) = mac_addr { - let mut tmp = [0; 6]; - - tmp.copy_from_slice(&pkt[0..6]); - 
pkt[6..12].copy_from_slice(&tmp); - pkt[0..6].copy_from_slice(mac_addr.as_bytes()); - } - - true } diff --git a/examples/helloworld-rs/src/main.rs b/examples/helloworld-rs/src/main.rs index 16f6a87..3855497 100644 --- a/examples/helloworld-rs/src/main.rs +++ b/examples/helloworld-rs/src/main.rs @@ -22,5 +22,5 @@ fn main() { } #[cfg(feature = "tracing")] - tracing::info!("sockets: {sockets:?}"); + tracing::info!("Sockets: {sockets:?}"); } diff --git a/examples/ip4ping-rs/Cargo.toml b/examples/ip4ping-rs/Cargo.toml index 1d13a2b..0fd97e9 100644 --- a/examples/ip4ping-rs/Cargo.toml +++ b/examples/ip4ping-rs/Cargo.toml @@ -6,13 +6,15 @@ edition = "2024" [dependencies] clap = { version = "4.5.35", features = ["derive"] } core_affinity = "0.8.3" -ctrlc = "3.4.5" +ctrlc = { version = "3.4.5", optional = true } flash = { path = "../../lib/flash-rs", features = ["clap"] } +macaddr = "1.0.1" tracing = { version = "0.1.41", optional = true } tracing-subscriber = { version = "0.3.19", optional = true } [features] -default = [] +default = ["dep:ctrlc"] +stats = ["flash/stats", "flash/tui"] tracing = ["dep:tracing", "dep:tracing-subscriber", "flash/tracing"] [lints.rust] diff --git a/examples/ip4ping-rs/src/cli.rs b/examples/ip4ping-rs/src/cli.rs index 31aa504..9d5cf63 100644 --- a/examples/ip4ping-rs/src/cli.rs +++ b/examples/ip4ping-rs/src/cli.rs @@ -1,5 +1,12 @@ +#[cfg(feature = "stats")] +use std::str::FromStr as _; + use clap::Parser; use flash::FlashConfig; +use macaddr::MacAddr6; + +#[cfg(feature = "stats")] +use flash::tui::GridLayout; #[derive(Debug, Parser)] pub struct Cli { @@ -21,4 +28,29 @@ pub struct Cli { help = "Ending CPU core index for socket threads (inclusive)" )] pub cpu_end: usize, + + #[arg(short = 'm', long, help = "Dest MAC address")] + pub mac_addr: Option, + + #[cfg(feature = "stats")] + #[command(flatten)] + pub stats: StatsConfig, +} + +#[cfg(feature = "stats")] +#[derive(Debug, Parser)] +pub struct StatsConfig { + #[arg( + short = 's', + long, + 
default_value_t = 1, + help = "CPU core index for stats thread" + )] + pub cpu: usize, + + #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + pub fps: u64, + + #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] + pub layout: GridLayout, } diff --git a/examples/ip4ping-rs/src/main.rs b/examples/ip4ping-rs/src/main.rs index b56fea5..0784743 100644 --- a/examples/ip4ping-rs/src/main.rs +++ b/examples/ip4ping-rs/src/main.rs @@ -11,10 +11,14 @@ use std::{ use clap::Parser; use flash::Socket; +use macaddr::MacAddr6; + +#[cfg(feature = "stats")] +use flash::tui::StatsDashboard; use crate::cli::Cli; -fn socket_thread(mut socket: Socket, run: &Arc) { +fn socket_thread(mut socket: Socket, mac_addr: Option, run: &Arc) { while run.load(Ordering::SeqCst) { if !socket.poll().is_ok_and(|val| val) { continue; @@ -24,9 +28,25 @@ fn socket_thread(mut socket: Socket, run: &Arc) { continue; }; - let (descs_send, descs_drop) = descs - .into_iter() - .partition(|desc| socket.read_exact(desc).is_ok_and(nf::echo_reply)); + let mut descs_send = Vec::with_capacity(descs.len()); + let mut descs_drop = Vec::with_capacity(descs.len()); + + for mut desc in descs { + let Ok(pkt) = socket.read_exact(&desc) else { + descs_drop.push(desc); + continue; + }; + + if nf::echo_reply(pkt) { + desc.set_next(1); + } + + if let Some(mac_addr) = mac_addr { + pkt[0..6].copy_from_slice(mac_addr.as_bytes()); + } + + descs_send.push(desc); + } socket.send(descs_send); socket.drop(descs_drop); @@ -55,6 +75,19 @@ fn main() { #[cfg(feature = "tracing")] tracing::debug!("Sockets: {:?}", sockets); + #[cfg(feature = "stats")] + let mut tui = match StatsDashboard::new( + sockets.iter().map(Socket::stats), + cli.stats.fps, + cli.stats.layout, + ) { + Ok(t) => t, + Err(err) => { + eprintln!("error creating tui: {err}"); + return; + } + }; + let cores = core_affinity::get_core_ids() .unwrap_or_default() .into_iter() @@ 
-62,19 +95,32 @@ fn main() { .collect::>(); if cores.is_empty() { - eprintln!("No cores found in range {}-{}", cli.cpu_start, cli.cpu_end); + eprintln!("no cores found in range {}-{}", cli.cpu_start, cli.cpu_end); return; } #[cfg(feature = "tracing")] tracing::debug!("Cores: {:?}", cores); + #[cfg(feature = "stats")] + let Some(stats_core) = core_affinity::get_core_ids() + .unwrap_or_default() + .into_iter() + .find(|core_id| core_id.id == cli.stats.cpu) + else { + eprintln!("no core found for stats thread {}", cli.stats.cpu); + return; + }; + let run = Arc::new(AtomicBool::new(true)); - let r = run.clone(); - if let Err(err) = ctrlc::set_handler(move || { - r.store(false, Ordering::SeqCst); - }) { + #[cfg(not(feature = "stats"))] + if let Err(err) = { + let r = run.clone(); + ctrlc::set_handler(move || { + r.store(false, Ordering::SeqCst); + }) + } { eprintln!("error setting Ctrl-C handler: {err}"); return; } @@ -86,11 +132,26 @@ fn main() { let r = run.clone(); thread::spawn(move || { core_affinity::set_for_current(core_id); - socket_thread(socket, &r); + socket_thread(socket, cli.mac_addr, &r); }) }) .collect::>(); + #[cfg(feature = "stats")] + if let Err(err) = thread::spawn(move || { + core_affinity::set_for_current(stats_core); + if let Err(err) = tui.run() { + eprintln!("error dumping stats: {err}"); + } + }) + .join() + { + eprintln!("error in stats thread: {err:?}"); + } + + #[cfg(feature = "stats")] + run.store(false, Ordering::SeqCst); + for handle in handles { if let Err(err) = handle.join() { eprintln!("error in thread: {err:?}"); diff --git a/examples/l2fwd-rs/Cargo.toml b/examples/l2fwd-rs/Cargo.toml index a670c9f..8412719 100644 --- a/examples/l2fwd-rs/Cargo.toml +++ b/examples/l2fwd-rs/Cargo.toml @@ -6,14 +6,15 @@ edition = "2024" [dependencies] clap = { version = "4.5.35", features = ["derive"] } core_affinity = "0.8.3" -ctrlc = "3.4.5" +ctrlc = { version = "3.4.5", optional = true } flash = { path = "../../lib/flash-rs", features = ["clap"] } 
macaddr = "1.0.1" tracing = { version = "0.1.41", optional = true } tracing-subscriber = { version = "0.3.19", optional = true } [features] -default = [] +default = ["dep:ctrlc"] +stats = ["flash/stats", "flash/tui"] tracing = ["dep:tracing", "dep:tracing-subscriber", "flash/tracing"] [lints.rust] diff --git a/examples/l2fwd-rs/src/cli.rs b/examples/l2fwd-rs/src/cli.rs index 97f0590..db57cb9 100644 --- a/examples/l2fwd-rs/src/cli.rs +++ b/examples/l2fwd-rs/src/cli.rs @@ -1,7 +1,13 @@ +#[cfg(feature = "stats")] +use std::str::FromStr as _; + use clap::Parser; use flash::FlashConfig; use macaddr::MacAddr6; +#[cfg(feature = "stats")] +use flash::tui::GridLayout; + #[derive(Debug, Parser)] pub struct Cli { #[command(flatten)] @@ -23,6 +29,28 @@ pub struct Cli { )] pub cpu_end: usize, + #[cfg(feature = "stats")] + #[command(flatten)] + pub stats: StatsConfig, + #[arg(short = 'm', long, help = "Dest MAC address")] pub mac_addr: Option, } + +#[cfg(feature = "stats")] +#[derive(Debug, Parser)] +pub struct StatsConfig { + #[arg( + short = 's', + long, + default_value_t = 1, + help = "CPU core index for stats thread" + )] + pub cpu: usize, + + #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + pub fps: u64, + + #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] + pub layout: GridLayout, +} diff --git a/examples/l2fwd-rs/src/main.rs b/examples/l2fwd-rs/src/main.rs index 742ad29..9a1a138 100644 --- a/examples/l2fwd-rs/src/main.rs +++ b/examples/l2fwd-rs/src/main.rs @@ -12,6 +12,9 @@ use clap::Parser; use flash::Socket; use macaddr::MacAddr6; +#[cfg(feature = "stats")] +use flash::tui::StatsDashboard; + use crate::cli::Cli; #[forbid(clippy::indexing_slicing)] @@ -39,12 +42,18 @@ fn socket_thread(mut socket: Socket, mac_addr: Option, run: &Arc t, + Err(err) => { + eprintln!("error creating tui: {err}"); + return; + } + }; + let cores = core_affinity::get_core_ids() 
.unwrap_or_default() .into_iter() @@ -80,19 +102,32 @@ fn main() { .collect::>(); if cores.is_empty() { - eprintln!("No cores found in range {}-{}", cli.cpu_start, cli.cpu_end); + eprintln!("no cores found in range {}-{}", cli.cpu_start, cli.cpu_end); return; } #[cfg(feature = "tracing")] tracing::debug!("Cores: {:?}", cores); + #[cfg(feature = "stats")] + let Some(stats_core) = core_affinity::get_core_ids() + .unwrap_or_default() + .into_iter() + .find(|core_id| core_id.id == cli.stats.cpu) + else { + eprintln!("no core found for stats thread {}", cli.stats.cpu); + return; + }; + let run = Arc::new(AtomicBool::new(true)); - let r = run.clone(); - if let Err(err) = ctrlc::set_handler(move || { - r.store(false, Ordering::SeqCst); - }) { + #[cfg(not(feature = "stats"))] + if let Err(err) = { + let r = run.clone(); + ctrlc::set_handler(move || { + r.store(false, Ordering::SeqCst); + }) + } { eprintln!("error setting Ctrl-C handler: {err}"); return; } @@ -109,6 +144,21 @@ fn main() { }) .collect::>(); + #[cfg(feature = "stats")] + if let Err(err) = thread::spawn(move || { + core_affinity::set_for_current(stats_core); + if let Err(err) = tui.run() { + eprintln!("error dumping stats: {err}"); + } + }) + .join() + { + eprintln!("error in stats thread: {err:?}"); + } + + #[cfg(feature = "stats")] + run.store(false, Ordering::SeqCst); + for handle in handles { if let Err(err) = handle.join() { eprintln!("error in thread: {err:?}"); diff --git a/examples/maglev-rs/Cargo.toml b/examples/maglev-rs/Cargo.toml index 22255fa..0934e0b 100644 --- a/examples/maglev-rs/Cargo.toml +++ b/examples/maglev-rs/Cargo.toml @@ -6,7 +6,7 @@ edition = "2024" [dependencies] clap = { version = "4.5.35", features = ["derive"] } core_affinity = "0.8.3" -ctrlc = "3.4.5" +ctrlc = { version = "3.4.5", optional = true } flash = { path = "../../lib/flash-rs", features = ["clap"] } fnv = "1.0.7" macaddr = "1.0.1" @@ -15,7 +15,8 @@ tracing-subscriber = { version = "0.3.19", optional = true } twox-hash = 
"2.1.0" [features] -default = [] +default = ["dep:ctrlc"] +stats = ["flash/stats", "flash/tui"] tracing = ["dep:tracing", "dep:tracing-subscriber", "flash/tracing"] [lints.rust] diff --git a/examples/maglev-rs/src/cli.rs b/examples/maglev-rs/src/cli.rs index 236d699..84451d9 100644 --- a/examples/maglev-rs/src/cli.rs +++ b/examples/maglev-rs/src/cli.rs @@ -1,7 +1,15 @@ +use std::net::Ipv4Addr; + +#[cfg(feature = "stats")] +use std::str::FromStr as _; + use clap::Parser; use flash::FlashConfig; use macaddr::MacAddr6; +#[cfg(feature = "stats")] +use flash::tui::GridLayout; + #[derive(Debug, Parser)] pub struct Cli { #[command(flatten)] @@ -23,6 +31,31 @@ pub struct Cli { )] pub cpu_end: usize, + #[cfg(feature = "stats")] + #[command(flatten)] + pub stats: StatsConfig, + + #[arg(short = 'F', long, help = "Fallback IPv4 address")] + pub fallback_ip: Option, + #[arg(short = 'm', long, help = "Dest MAC address for next NFs")] pub next_mac: Vec, } + +#[cfg(feature = "stats")] +#[derive(Debug, Parser)] +pub struct StatsConfig { + #[arg( + short = 's', + long, + default_value_t = 1, + help = "CPU core index for stats thread" + )] + pub cpu: usize, + + #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + pub fps: u64, + + #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] + pub layout: GridLayout, +} diff --git a/examples/maglev-rs/src/main.rs b/examples/maglev-rs/src/main.rs index 42a5707..1ae88f3 100644 --- a/examples/maglev-rs/src/main.rs +++ b/examples/maglev-rs/src/main.rs @@ -4,6 +4,7 @@ mod nf; use std::{ hash::BuildHasher, + net::Ipv4Addr, sync::{ Arc, atomic::{AtomicBool, Ordering}, @@ -12,10 +13,13 @@ use std::{ }; use clap::Parser; -use flash::{Route, Socket}; +use flash::Socket; use fnv::FnvBuildHasher; use macaddr::MacAddr6; +#[cfg(feature = "stats")] +use flash::tui::StatsDashboard; + use crate::{cli::Cli, maglev::Maglev}; const MAGLEV_TABLE_SIZE: usize = 
65537; @@ -23,7 +27,7 @@ const MAGLEV_TABLE_SIZE: usize = 65537; fn socket_thread( mut socket: Socket, maglev: &Arc>, - route: &Arc, + next_ip: &Arc>, next_mac: &Arc>, run: &Arc, ) { @@ -37,16 +41,18 @@ fn socket_thread( }; let mut descs_send = Vec::with_capacity(descs.len()); - let mut descs_drop = Vec::new(); + let mut descs_drop = Vec::with_capacity(descs.len()); for mut desc in descs { - if let Ok(pkt) = socket.read_exact(&desc) { - if let Some(idx) = nf::load_balance(pkt, maglev, route, next_mac) { - desc.set_next(idx); - descs_send.push(desc); - } else { - descs_drop.push(desc); + if let Ok(pkt) = socket.read_exact(&desc) + && let Some(idx) = nf::load_balance(pkt, maglev, next_ip) + { + if let Some(next_mac) = next_mac.get(idx).or_else(|| next_mac.first()) { + pkt[0..6].copy_from_slice(next_mac.as_bytes()); } + + desc.set_next(idx); + descs_send.push(desc); } else { descs_drop.push(desc); } @@ -57,6 +63,7 @@ fn socket_thread( } } +#[allow(clippy::too_many_lines)] fn main() { #[cfg(feature = "tracing")] tracing_subscriber::fmt::init(); @@ -76,25 +83,44 @@ fn main() { return; } - if route.next.is_empty() { - eprintln!("empty route received"); - return; - } + #[cfg(feature = "tracing")] + tracing::debug!("Sockets: {:?}", sockets); + + #[cfg(feature = "stats")] + let mut tui = match StatsDashboard::new( + sockets.iter().map(Socket::stats), + cli.stats.fps, + cli.stats.layout, + ) { + Ok(t) => t, + Err(err) => { + eprintln!("error creating tui: {err}"); + return; + } + }; + + let next_ip = if route.next.is_empty() { + if let Some(fb_ip) = cli.fallback_ip { + vec![fb_ip] + } else { + eprintln!("empty route and no fallback IP configured"); + return; + } + } else { + route.next + }; - if cli.next_mac.len() > 1 && cli.next_mac.len() != route.next.len() { + if cli.next_mac.len() > 1 && cli.next_mac.len() != next_ip.len() { eprintln!( "number of next NF MACs ({}) does not match number of next NFs ({})", cli.next_mac.len(), - route.next.len() + next_ip.len() ); return; } 
- let maglev = Arc::new(Maglev::::new( - &route.next, - MAGLEV_TABLE_SIZE, - )); - let route = Arc::new(route); + let maglev = Arc::new(Maglev::::new(&next_ip, MAGLEV_TABLE_SIZE)); + let next_ip = Arc::new(next_ip); let next_mac = Arc::new(cli.next_mac); let cores = core_affinity::get_core_ids() @@ -111,12 +137,25 @@ fn main() { #[cfg(feature = "tracing")] tracing::debug!("Cores: {:?}", cores); + #[cfg(feature = "stats")] + let Some(stats_core) = core_affinity::get_core_ids() + .unwrap_or_default() + .into_iter() + .find(|core_id| core_id.id == cli.stats.cpu) + else { + eprintln!("no core found for stats thread {}", cli.stats.cpu); + return; + }; + let run = Arc::new(AtomicBool::new(true)); - let r = run.clone(); - if let Err(err) = ctrlc::set_handler(move || { - r.store(false, Ordering::SeqCst); - }) { + #[cfg(not(feature = "stats"))] + if let Err(err) = { + let r = run.clone(); + ctrlc::set_handler(move || { + r.store(false, Ordering::SeqCst); + }) + } { eprintln!("error setting Ctrl-C handler: {err}"); return; } @@ -127,16 +166,31 @@ fn main() { .map(|(socket, core_id)| { let r = run.clone(); let maglev = maglev.clone(); - let route = route.clone(); - let next_macs = next_mac.clone(); + let next_ip = next_ip.clone(); + let next_mac = next_mac.clone(); thread::spawn(move || { core_affinity::set_for_current(core_id); - socket_thread(socket, &maglev, &route, &next_macs, &r); + socket_thread(socket, &maglev, &next_ip, &next_mac, &r); }) }) .collect::>(); + #[cfg(feature = "stats")] + if let Err(err) = thread::spawn(move || { + core_affinity::set_for_current(stats_core); + if let Err(err) = tui.run() { + eprintln!("error dumping stats: {err}"); + } + }) + .join() + { + eprintln!("error in stats thread: {err:?}"); + } + + #[cfg(feature = "stats")] + run.store(false, Ordering::SeqCst); + for handle in handles { if let Err(err) = handle.join() { eprintln!("error in thread: {err:?}"); diff --git a/examples/maglev-rs/src/nf.rs b/examples/maglev-rs/src/nf.rs index 
bc80e0b..caf8f03 100644 --- a/examples/maglev-rs/src/nf.rs +++ b/examples/maglev-rs/src/nf.rs @@ -2,9 +2,6 @@ use std::{hash::BuildHasher, net::Ipv4Addr}; -use flash::Route; -use macaddr::MacAddr6; - use crate::maglev::Maglev; const ETHER_TYPE_IPV4: u16 = 0x0800; @@ -42,8 +39,7 @@ impl Tuple5 { pub fn load_balance( pkt: &mut [u8; 54], maglev: &Maglev, - route: &Route, - next_mac: &[MacAddr6], + next_ip: &[Ipv4Addr], ) -> Option { if u16::from_be_bytes([pkt[12], pkt[13]]) != ETHER_TYPE_IPV4 { return None; @@ -55,7 +51,7 @@ pub fn load_balance( // } let idx = maglev.lookup(&tuple5); - let next_ip = route.next.get(idx)?.octets(); + let next_ip = next_ip.get(idx)?.octets(); let mut csum = u32::from(!u16::from_be_bytes([pkt[24], pkt[25]])); csum = csum.wrapping_add(u32::from(u16::from_be_bytes([next_ip[0], next_ip[1]]))); @@ -71,13 +67,5 @@ pub fn load_balance( pkt[24..26].copy_from_slice(&(!(csum as u16)).to_be_bytes()); pkt[30..34].copy_from_slice(&next_ip); - if let Some(next_mac) = next_mac.get(idx).or_else(|| next_mac.first()) { - let mut tmp = [0; 6]; - tmp.copy_from_slice(&pkt[0..6]); - - pkt[6..12].copy_from_slice(&tmp); - pkt[0..6].copy_from_slice(next_mac.as_bytes()); - } - Some(idx) } diff --git a/examples/simplefwd-rs/Cargo.toml b/examples/simplefwd-rs/Cargo.toml index 1995e64..a3146a9 100644 --- a/examples/simplefwd-rs/Cargo.toml +++ b/examples/simplefwd-rs/Cargo.toml @@ -6,13 +6,15 @@ edition = "2024" [dependencies] clap = { version = "4.5.35", features = ["derive"] } core_affinity = "0.8.3" -ctrlc = "3.4.5" +ctrlc = { version = "3.4.5", optional = true } flash = { path = "../../lib/flash-rs", features = ["clap"] } +macaddr = "1.0.1" tracing = { version = "0.1.41", optional = true } tracing-subscriber = { version = "0.3.19", optional = true } [features] -default = [] +default = ["dep:ctrlc"] +stats = ["flash/stats", "flash/tui"] tracing = ["dep:tracing", "dep:tracing-subscriber", "flash/tracing"] [lints.rust] diff --git a/examples/simplefwd-rs/src/cli.rs 
b/examples/simplefwd-rs/src/cli.rs index 31aa504..db57cb9 100644 --- a/examples/simplefwd-rs/src/cli.rs +++ b/examples/simplefwd-rs/src/cli.rs @@ -1,5 +1,12 @@ +#[cfg(feature = "stats")] +use std::str::FromStr as _; + use clap::Parser; use flash::FlashConfig; +use macaddr::MacAddr6; + +#[cfg(feature = "stats")] +use flash::tui::GridLayout; #[derive(Debug, Parser)] pub struct Cli { @@ -21,4 +28,29 @@ pub struct Cli { help = "Ending CPU core index for socket threads (inclusive)" )] pub cpu_end: usize, + + #[cfg(feature = "stats")] + #[command(flatten)] + pub stats: StatsConfig, + + #[arg(short = 'm', long, help = "Dest MAC address")] + pub mac_addr: Option, +} + +#[cfg(feature = "stats")] +#[derive(Debug, Parser)] +pub struct StatsConfig { + #[arg( + short = 's', + long, + default_value_t = 1, + help = "CPU core index for stats thread" + )] + pub cpu: usize, + + #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + pub fps: u64, + + #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] + pub layout: GridLayout, } diff --git a/examples/simplefwd-rs/src/main.rs b/examples/simplefwd-rs/src/main.rs index 8957e58..9ea3956 100644 --- a/examples/simplefwd-rs/src/main.rs +++ b/examples/simplefwd-rs/src/main.rs @@ -10,20 +10,43 @@ use std::{ use clap::Parser; use flash::Socket; +use macaddr::MacAddr6; + +#[cfg(feature = "stats")] +use flash::tui::StatsDashboard; use crate::cli::Cli; -fn socket_thread(mut socket: Socket, run: &Arc) { +fn socket_thread(mut socket: Socket, mac_addr: Option, run: &Arc) { while run.load(Ordering::SeqCst) { if !socket.poll().is_ok_and(|val| val) { continue; } let Ok(descs) = socket.recv() else { - break; + continue; + }; + + let Some(mac_addr) = mac_addr else { + socket.send(descs); + continue; }; - socket.send(descs); + let mut descs_send = Vec::with_capacity(descs.len()); + let mut descs_drop = Vec::with_capacity(descs.len()); + + for desc in descs { 
+ let Ok(pkt) = socket.read_exact::<6>(&desc) else { + descs_drop.push(desc); + continue; + }; + + pkt[0..6].copy_from_slice(mac_addr.as_bytes()); + descs_send.push(desc); + } + + socket.send(descs_send); + socket.drop(descs_drop); } } @@ -49,6 +72,19 @@ fn main() { #[cfg(feature = "tracing")] tracing::info!("Sockets: {sockets:?}"); + #[cfg(feature = "stats")] + let mut tui = match StatsDashboard::new( + sockets.iter().map(Socket::stats), + cli.stats.fps, + cli.stats.layout, + ) { + Ok(t) => t, + Err(err) => { + eprintln!("error creating tui: {err}"); + return; + } + }; + let cores = core_affinity::get_core_ids() .unwrap_or_default() .into_iter() @@ -56,19 +92,32 @@ fn main() { .collect::>(); if cores.is_empty() { - eprintln!("No cores found in range {}-{}", cli.cpu_start, cli.cpu_end); + eprintln!("no cores found in range {}-{}", cli.cpu_start, cli.cpu_end); return; } #[cfg(feature = "tracing")] tracing::debug!("Cores: {:?}", cores); + #[cfg(feature = "stats")] + let Some(stats_core) = core_affinity::get_core_ids() + .unwrap_or_default() + .into_iter() + .find(|core_id| core_id.id == cli.stats.cpu) + else { + eprintln!("no core found for stats thread {}", cli.stats.cpu); + return; + }; + let run = Arc::new(AtomicBool::new(true)); - let r = run.clone(); - if let Err(err) = ctrlc::set_handler(move || { - r.store(false, Ordering::SeqCst); - }) { + #[cfg(not(feature = "stats"))] + if let Err(err) = { + let r = run.clone(); + ctrlc::set_handler(move || { + r.store(false, Ordering::SeqCst); + }) + } { eprintln!("error setting Ctrl-C handler: {err}"); return; } @@ -80,11 +129,26 @@ fn main() { let r = run.clone(); thread::spawn(move || { core_affinity::set_for_current(core_id); - socket_thread(socket, &r); + socket_thread(socket, cli.mac_addr, &r); }) }) .collect::>(); + #[cfg(feature = "stats")] + if let Err(err) = thread::spawn(move || { + core_affinity::set_for_current(stats_core); + if let Err(err) = tui.run() { + eprintln!("error dumping stats: {err}"); + } + }) + 
.join() + { + eprintln!("error in stats thread: {err:?}"); + } + + #[cfg(feature = "stats")] + run.store(false, Ordering::SeqCst); + for handle in handles { if let Err(err) = handle.join() { eprintln!("error in thread: {err:?}"); diff --git a/lib/flash-rs/Cargo.toml b/lib/flash-rs/Cargo.toml index 7dd2d78..c4fa301 100644 --- a/lib/flash-rs/Cargo.toml +++ b/lib/flash-rs/Cargo.toml @@ -2,16 +2,19 @@ name = "flash" version = "0.1.0" edition = "2024" +rust-version = "1.88" description = "Flash userspace library for AF_XDP network function chaining" -repository = "https://github.com/rickydebojeet/flash" +repository = "https://github.com/networkedsystemsIITB/flash" license = "Apache-2.0" [dependencies] bitflags = "2.9.0" +chrono = { version = "0.4.42", optional = true } clap = { version = "4.5.35", features = ["derive"], optional = true } libc = "0.2.171" libxdp-sys = "0.2.1" quanta = "0.12.5" +ratatui = { version = "0.29.0", optional = true } ringbuffer = { version = "0.15.0", optional = true } thiserror = "2.0.12" tracing = { version = "0.1.41", optional = true } @@ -22,4 +25,5 @@ default = [] clap = ["dep:clap"] pool = ["dep:ringbuffer"] stats = [] +tui = ["stats", "dep:chrono", "dep:ratatui"] tracing = ["dep:tracing"] diff --git a/lib/flash-rs/src/client.rs b/lib/flash-rs/src/client.rs index aa61a84..2bbf1e7 100644 --- a/lib/flash-rs/src/client.rs +++ b/lib/flash-rs/src/client.rs @@ -1,24 +1,25 @@ use std::{net::Ipv4Addr, str::FromStr, sync::Arc}; use crate::{ - FlashError, Socket, config::{BindFlags, FlashConfig, Mode, PollConfig, XskConfig}, + error::FlashResult, fd::Fd, mem::Umem, uds::UdsClient, - xsk::SocketShared, + xsk::{Socket, SocketShared}, }; #[cfg(feature = "stats")] use crate::{config::XdpFlags, stats::Stats}; +#[derive(Debug)] pub struct Route { pub ip_addr: Ipv4Addr, pub next: Vec, } #[allow(clippy::missing_errors_doc, clippy::too_many_lines)] -pub fn connect(config: &FlashConfig) -> Result<(Vec, Route), FlashError> { +pub fn connect(config: 
&FlashConfig) -> FlashResult<(Vec, Route)> { let mut uds_client = UdsClient::new()?; let (umem_fd, total_sockets, umem_size, umem_scale) = @@ -118,34 +119,23 @@ pub fn connect(config: &FlashConfig) -> Result<(Vec, Route), FlashError> let socket_shared = Arc::new(SocketShared::new(xsk_config, poll_config, uds_client)); - #[cfg(feature = "stats")] let sockets = socket_info .into_iter() .enumerate() - .map(|(i, (fd, ifqueue))| { - Socket::new( - fd.clone(), - Umem::new(umem_fd, umem_size)?, - i, - umem_scale, - umem_offset, - Stats::new(fd, ifname.clone(), ifqueue, xdp_flags.clone()), - socket_shared.clone(), - ) - }) - .collect::, _>>()?; + .map(|(i, socket_data)| { + #[cfg(feature = "stats")] + let (fd, ifqueue) = socket_data; + #[cfg(not(feature = "stats"))] + let fd = socket_data; - #[cfg(not(feature = "stats"))] - let sockets = socket_info - .into_iter() - .enumerate() - .map(|(i, fd)| { Socket::new( fd.clone(), Umem::new(umem_fd, umem_size)?, i, umem_scale, umem_offset, + #[cfg(feature = "stats")] + Stats::new(fd, ifname.clone(), ifqueue, xdp_flags.clone()), socket_shared.clone(), ) }) diff --git a/lib/flash-rs/src/config/config_noclap.rs b/lib/flash-rs/src/config/config.rs similarity index 100% rename from lib/flash-rs/src/config/config_noclap.rs rename to lib/flash-rs/src/config/config.rs diff --git a/lib/flash-rs/src/config/mod.rs b/lib/flash-rs/src/config/mod.rs index c834206..f76e30b 100644 --- a/lib/flash-rs/src/config/mod.rs +++ b/lib/flash-rs/src/config/mod.rs @@ -3,22 +3,16 @@ mod error; mod poll; mod xsk; -#[cfg(feature = "clap")] -mod config_clap; +#[cfg_attr(feature = "clap", path = "config_clap.rs")] +#[allow(clippy::module_inception)] +mod config; -#[cfg(not(feature = "clap"))] -mod config_noclap; +pub(crate) use { + poll::PollConfig, + xsk::{BindFlags, Mode, XskConfig}, +}; -pub(crate) use poll::PollConfig; -pub(crate) use xsk::{BindFlags, Mode, XskConfig}; - -pub use error::ConfigError; - -#[cfg(feature = "clap")] -pub use 
config_clap::FlashConfig; - -#[cfg(not(feature = "clap"))] -pub use config_noclap::FlashConfig; +pub use {config::FlashConfig, error::ConfigError}; #[cfg(feature = "stats")] pub use xsk::XdpFlags; diff --git a/lib/flash-rs/src/error.rs b/lib/flash-rs/src/error.rs index 2a45b51..9d4b926 100644 --- a/lib/flash-rs/src/error.rs +++ b/lib/flash-rs/src/error.rs @@ -2,6 +2,8 @@ use std::{io, net::AddrParseError}; use crate::{config::ConfigError, fd::FdError, uds::UdsError, xsk::SocketError}; +pub(crate) type FlashResult = Result; + #[derive(Debug, thiserror::Error)] #[error("flash error: {0}")] pub enum FlashError { @@ -9,7 +11,7 @@ pub enum FlashError { AddrParse(#[from] AddrParseError), Config(#[from] ConfigError), - UDS(#[from] UdsError), Fd(#[from] FdError), Socket(#[from] SocketError), + UDS(#[from] UdsError), } diff --git a/lib/flash-rs/src/fd/fd.rs b/lib/flash-rs/src/fd/fd.rs index 8815ff9..47d118f 100644 --- a/lib/flash-rs/src/fd/fd.rs +++ b/lib/flash-rs/src/fd/fd.rs @@ -27,15 +27,16 @@ pub(crate) struct Fd { // poll_timeout: i32, } +#[allow(clippy::missing_fields_in_debug)] impl fmt::Debug for Fd { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&self, f) + f.debug_struct("Fd").field("id", &self.id).finish() } } impl Fd { pub(crate) fn new(id: i32) -> Self { - assert!(id >= 0, "Invalid file descriptor: {id}"); + assert!(id >= 0, "fd error: invalid file descriptor: {id}"); Fd { id, @@ -123,7 +124,7 @@ impl Fd { SOL_XDP, XDP_STATISTICS, (&raw mut stats).cast(), - &mut optlen, + &raw mut optlen, ) } != 0 { diff --git a/lib/flash-rs/src/fd/mod.rs b/lib/flash-rs/src/fd/mod.rs index f2b28c1..d2b3046 100644 --- a/lib/flash-rs/src/fd/mod.rs +++ b/lib/flash-rs/src/fd/mod.rs @@ -1,7 +1,9 @@ mod error; -mod fd; mod xdp; +#[allow(clippy::module_inception)] +mod fd; + pub(crate) use fd::Fd; pub use error::FdError; diff --git a/lib/flash-rs/src/lib.rs b/lib/flash-rs/src/lib.rs index c4e3334..4ac2bda 100644 --- a/lib/flash-rs/src/lib.rs +++ 
b/lib/flash-rs/src/lib.rs @@ -8,7 +8,10 @@ mod util; mod xsk; #[cfg(feature = "stats")] -mod stats; +pub mod stats; + +#[cfg(feature = "tui")] +pub mod tui; pub use crate::{ client::{Route, connect}, @@ -16,6 +19,3 @@ pub use crate::{ error::FlashError, xsk::Socket, }; - -#[cfg(feature = "stats")] -pub use stats::Stats; diff --git a/lib/flash-rs/src/mem/desc.rs b/lib/flash-rs/src/mem/desc.rs index 93ec144..8270869 100644 --- a/lib/flash-rs/src/mem/desc.rs +++ b/lib/flash-rs/src/mem/desc.rs @@ -1,6 +1,6 @@ use libxdp_sys::{xdp_desc, xsk_umem__add_offset_to_addr, xsk_umem__extract_addr}; -use crate::mem::FRAME_SIZE; +use super::FRAME_SIZE; #[derive(Debug)] pub struct Desc { diff --git a/lib/flash-rs/src/mem/pool.rs b/lib/flash-rs/src/mem/pool.rs index 991735a..cbb81ae 100644 --- a/lib/flash-rs/src/mem/pool.rs +++ b/lib/flash-rs/src/mem/pool.rs @@ -8,16 +8,13 @@ pub(crate) struct Pool(AllocRingBuffer); impl Pool { pub(crate) fn new(scale: u32, offset: u64) -> Self { - let frame_size = u64::from(FRAME_SIZE); let nr_frames = XSK_RING_PROD__DEFAULT_NUM_DESCS * scale; - let mut ring_buffer = AllocRingBuffer::new(2 * nr_frames as usize); - let mut addr = (offset + u64::from(nr_frames)) * frame_size; - for _ in 0..nr_frames { - ring_buffer.push(addr); - addr += frame_size; - } + let nr_frames = u64::from(nr_frames); + let shift = FRAME_SIZE.trailing_zeros(); + + ring_buffer.extend(((offset + nr_frames)..(offset + 2 * nr_frames)).map(|x| x << shift)); Self(ring_buffer) } @@ -33,7 +30,7 @@ impl Pool { } #[inline] - pub(crate) fn extend(&mut self, iter: impl IntoIterator) { + pub(crate) fn put_batch(&mut self, iter: impl IntoIterator) { self.0.extend(iter); } } diff --git a/lib/flash-rs/src/mem/ring/mod.rs b/lib/flash-rs/src/mem/ring/mod.rs index bf4b51d..dc17667 100644 --- a/lib/flash-rs/src/mem/ring/mod.rs +++ b/lib/flash-rs/src/mem/ring/mod.rs @@ -4,10 +4,7 @@ mod fill; mod rx; mod tx; -pub(crate) use comp::CompRing; -pub(crate) use fill::FillRing; -pub(crate) use 
rx::RxRing; -pub(crate) use tx::TxRing; +pub(crate) use {comp::CompRing, fill::FillRing, rx::RxRing, tx::TxRing}; pub(crate) trait Prod { fn needs_wakeup(&self) -> bool; diff --git a/lib/flash-rs/src/stats/mod.rs b/lib/flash-rs/src/stats/mod.rs new file mode 100644 index 0000000..a5327a9 --- /dev/null +++ b/lib/flash-rs/src/stats/mod.rs @@ -0,0 +1,5 @@ +#[allow(clippy::module_inception)] +mod stats; +mod sub; + +pub use {stats::Stats, sub::*}; diff --git a/lib/flash-rs/src/stats.rs b/lib/flash-rs/src/stats/stats.rs similarity index 54% rename from lib/flash-rs/src/stats.rs rename to lib/flash-rs/src/stats/stats.rs index cad7b57..b0351ab 100644 --- a/lib/flash-rs/src/stats.rs +++ b/lib/flash-rs/src/stats/stats.rs @@ -5,13 +5,15 @@ use crate::{ fd::{Fd, FdError}, }; +use super::sub::{AppStats, Interface, RingStats, XdpStats}; + #[derive(Debug)] pub struct Stats { fd: Fd, pub interface: Interface, pub xdp_flags: XdpFlags, - pub(super) ring: UnsafeCell, - pub(super) app: UnsafeCell, + pub(crate) ring: UnsafeCell, + pub(crate) app: UnsafeCell, } unsafe impl Send for Stats {} @@ -31,50 +33,20 @@ impl Stats { } } + #[inline] pub fn get_ring_stats(&self) -> RingStats { - unsafe { (*self.ring.get()).clone() } + unsafe { *self.ring.get() } } + #[inline] pub fn get_app_stats(&self) -> AppStats { - unsafe { (*self.app.get()).clone() } + unsafe { *self.app.get() } } + #[inline] #[allow(clippy::missing_errors_doc, clippy::missing_transmute_annotations)] pub fn get_xdp_stats(&self) -> Result { let xdp_stats = self.fd.xdp_statistics()?; Ok(unsafe { mem::transmute::<_, XdpStats>(xdp_stats) }) } } - -#[derive(Debug, Clone)] -pub struct Interface { - pub name: String, - pub queue: u32, -} - -#[derive(Debug, Default, Clone)] -pub struct RingStats { - pub rx: u64, - pub tx: u64, - pub drop: u64, -} - -#[derive(Debug, Default, Clone)] -pub struct AppStats { - pub rx_empty_polls: u64, - pub fill_fail_polls: u64, - pub tx_copy_sendtos: u64, - pub tx_wakeup_sendtos: u64, - pub opt_polls: 
u64, - pub backpressure: u64, -} - -#[derive(Debug, Default, Clone)] -pub struct XdpStats { - pub rx_dropped: u64, - pub rx_invalid_descs: u64, - pub tx_invalid_descs: u64, - pub rx_ring_full: u64, - pub rx_fill_ring_empty_descs: u64, - pub tx_ring_empty_descs: u64, -} diff --git a/lib/flash-rs/src/stats/sub.rs b/lib/flash-rs/src/stats/sub.rs new file mode 100644 index 0000000..1e0560c --- /dev/null +++ b/lib/flash-rs/src/stats/sub.rs @@ -0,0 +1,32 @@ +#[derive(Debug, Clone)] +pub struct Interface { + pub name: String, + pub queue: u32, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct RingStats { + pub rx: u64, + pub tx: u64, + pub drop: u64, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct AppStats { + pub rx_empty_polls: u64, + pub fill_fail_polls: u64, + pub tx_copy_sendtos: u64, + pub tx_wakeup_sendtos: u64, + pub opt_polls: u64, + pub backpressure: u64, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct XdpStats { + pub rx_dropped: u64, + pub rx_invalid_descs: u64, + pub tx_invalid_descs: u64, + pub rx_ring_full: u64, + pub rx_fill_ring_empty_descs: u64, + pub tx_ring_empty_descs: u64, +} diff --git a/lib/flash-rs/src/tui/dashboard.rs b/lib/flash-rs/src/tui/dashboard.rs new file mode 100644 index 0000000..163bccf --- /dev/null +++ b/lib/flash-rs/src/tui/dashboard.rs @@ -0,0 +1,147 @@ +use std::{ + io, + sync::Arc, + time::{Duration, Instant}, +}; + +use ratatui::{ + Terminal, + crossterm::{ + ExecutableCommand, + event::{self, Event, KeyCode, KeyEvent, KeyEventKind}, + terminal::{self, EnterAlternateScreen, LeaveAlternateScreen}, + }, + prelude::CrosstermBackend, +}; + +use crate::stats::Stats; + +use super::{ + error::{TuiError, TuiResult}, + layout::{GridLayout, LayoutCache}, + panel::StatsPanel, +}; + +#[derive(Debug)] +pub struct StatsDashboard { + frame_interval: Duration, + last_frame_time: Instant, + terminal: Terminal>, + panels: Vec, + layout_cache: LayoutCache, +} + +impl StatsDashboard { + #[allow(clippy::missing_errors_doc)] + 
pub fn new( + stats: impl Iterator>, + fps: u64, + layout: GridLayout, + ) -> TuiResult { + let panels = stats + .enumerate() + .map(|(i, stat)| StatsPanel::new(i, stat)) + .collect::>(); + + let num_panels = panels.len(); + if num_panels == 0 { + return Err(TuiError::EmptyStats); + } + + let terminal = Terminal::new(CrosstermBackend::new(io::stdout()))?; + + Ok(Self { + frame_interval: Duration::from_micros(1_000_000 / fps.max(1)), + last_frame_time: Instant::now(), + terminal, + panels, + layout_cache: LayoutCache::new(layout, num_panels), + }) + } + + fn resize_panels(&mut self, terminal_size: (u16, u16)) { + if let Some(panel_areas) = self.layout_cache.update_panel_areas(terminal_size) { + for (panel, &area) in self.panels.iter_mut().zip(panel_areas.iter()) { + panel.resize(area); + } + } + } + + fn render(&mut self) -> io::Result<()> { + self.terminal.draw(|frame| { + for panel in &mut self.panels { + panel.render(frame); + } + })?; + + Ok(()) + } + + fn poll_until_next_frame(&mut self) -> io::Result { + let next_frame_time = self.last_frame_time + self.frame_interval; + + loop { + let now = Instant::now(); + if now >= next_frame_time { + break; + } + + if event::poll(next_frame_time - now)? { + match event::read()? { + Event::Key(KeyEvent { + code, + kind: KeyEventKind::Press, + .. + }) => match code { + KeyCode::Char('q' | 'Q') | KeyCode::Esc => return Ok(true), + KeyCode::Char('r' | 'R') => return Ok(false), + _ => {} + }, + Event::Resize(width, height) => { + self.resize_panels((width, height)); + return Ok(false); + } + _ => {} + } + } + } + + Ok(false) + } + + #[allow(clippy::missing_errors_doc)] + pub fn run(&mut self) -> TuiResult<()> { + let _guard = TerminalGuard::new(&mut self.terminal)?; + self.resize_panels(terminal::size()?); + + loop { + self.last_frame_time = Instant::now(); + self.render()?; + + if self.poll_until_next_frame()? 
{ + break; + } + } + + Ok(()) + } +} + +struct TerminalGuard; + +impl TerminalGuard { + fn new(terminal: &mut Terminal>) -> io::Result { + terminal::enable_raw_mode()?; + io::stdout().execute(EnterAlternateScreen)?; + terminal.hide_cursor()?; + Ok(Self) + } +} + +impl Drop for TerminalGuard { + fn drop(&mut self) { + let _ = io::stdout().execute(LeaveAlternateScreen); + let _ = terminal::disable_raw_mode(); + // terminal.show_cursor().ok(); + } +} diff --git a/lib/flash-rs/src/tui/error.rs b/lib/flash-rs/src/tui/error.rs new file mode 100644 index 0000000..6f8ec76 --- /dev/null +++ b/lib/flash-rs/src/tui/error.rs @@ -0,0 +1,12 @@ +use std::io; + +pub(super) type TuiResult = Result; + +#[derive(Debug, thiserror::Error)] +#[error("tui error: {0}")] +pub enum TuiError { + IO(#[from] io::Error), + + #[error("tui error: empty stats")] + EmptyStats, +} diff --git a/lib/flash-rs/src/tui/layout.rs b/lib/flash-rs/src/tui/layout.rs new file mode 100644 index 0000000..5cc62b3 --- /dev/null +++ b/lib/flash-rs/src/tui/layout.rs @@ -0,0 +1,110 @@ +use std::iter; + +use ratatui::layout::{Constraint, Direction, Layout, Rect}; + +#[derive(Debug)] +pub(super) struct LayoutCache { + config: LayoutConfig, + cached_areas: Vec, + last_terminal_size: (u16, u16), + num_panels: usize, +} + +impl LayoutCache { + pub(super) fn new(config: GridLayout, num_panels: usize) -> Self { + Self { + config: config.into(), + cached_areas: Vec::new(), + last_terminal_size: (0, 0), + num_panels, + } + } + + pub(super) fn update_panel_areas(&mut self, terminal_size: (u16, u16)) -> Option<&[Rect]> { + if self.last_terminal_size == terminal_size { + None + } else { + self.recalculate_layout(terminal_size); + Some(&self.cached_areas) + } + } + + fn recalculate_layout(&mut self, size: (u16, u16)) { + let container = Rect { + x: 0, + y: 0, + width: size.0, + height: size.1, + }; + + self.cached_areas = self + .config + .calculate_panel_areas(self.num_panels, container); + self.last_terminal_size = size; + } +} 
+ +#[derive(Debug)] +struct LayoutConfig { + primary_direction: Direction, + panels_per_line: usize, +} + +impl LayoutConfig { + fn calculate_panel_areas(&self, num_panels: usize, container: Rect) -> Vec { + let line_direction = match self.primary_direction { + Direction::Horizontal => Direction::Vertical, + Direction::Vertical => Direction::Horizontal, + }; + + let num_lines = num_panels.div_ceil(self.panels_per_line); + let line_constraints = iter::repeat_n(Constraint::Fill(1), num_lines); + + let lines = Layout::default() + .direction(line_direction) + .constraints(line_constraints) + .split(container); + + let mut panel_areas = Vec::with_capacity(num_panels); + let panel_constraints = iter::repeat_n(Constraint::Fill(1), self.panels_per_line); + + for &line_area in lines.iter() { + let panels_in_line = Layout::default() + .direction(self.primary_direction) + .constraints(panel_constraints.clone()) + .split(line_area); + + panel_areas.extend_from_slice(&panels_in_line); + } + + panel_areas.truncate(num_panels); + panel_areas + } +} + +#[derive(Clone, Copy, Debug)] +pub enum GridLayout { + Rows(usize), + Columns(usize), +} + +impl Default for GridLayout { + fn default() -> Self { + GridLayout::Rows(3) + } +} + +impl From for LayoutConfig { + fn from(layout: GridLayout) -> Self { + match layout { + GridLayout::Rows(n) => LayoutConfig { + primary_direction: Direction::Vertical, + panels_per_line: n.max(1), + }, + GridLayout::Columns(n) => LayoutConfig { + primary_direction: Direction::Horizontal, + panels_per_line: n.max(1), + }, + } + } +} diff --git a/lib/flash-rs/src/tui/layout_str.rs b/lib/flash-rs/src/tui/layout_str.rs new file mode 100644 index 0000000..00dc968 --- /dev/null +++ b/lib/flash-rs/src/tui/layout_str.rs @@ -0,0 +1,50 @@ +use std::{fmt, str::FromStr}; + +use super::layout::GridLayout; + +#[derive(Debug, thiserror::Error)] +#[error( + "error parsing grid layout: '{0}' | expected format: (e.g. 
'3r' or '4col')" +)] +pub struct GridLayoutParseError(String); + +impl GridLayoutParseError { + pub(super) fn new(msg: impl Into) -> Self { + GridLayoutParseError(msg.into()) + } +} + +impl FromStr for GridLayout { + type Err = GridLayoutParseError; + + fn from_str(s: &str) -> Result { + let s = s.trim(); + let split_pos = match s.find(|c: char| !c.is_ascii_digit()) { + None | Some(0) => { + return Err(GridLayoutParseError::new(s)); + } + Some(pos) => pos, + }; + + let (num_str, suffix) = s.split_at(split_pos); + + let Ok(num) = num_str.parse::() else { + return Err(GridLayoutParseError::new(s)); + }; + + match suffix.trim() { + "r" | "row" | "rows" => Ok(GridLayout::Rows(num)), + "c" | "col" | "cols" | "column" | "columns" => Ok(GridLayout::Columns(num)), + _ => Err(GridLayoutParseError::new(s)), + } + } +} + +impl fmt::Display for GridLayout { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + GridLayout::Rows(n) => write!(f, "{n}row"), + GridLayout::Columns(n) => write!(f, "{n}col"), + } + } +} diff --git a/lib/flash-rs/src/tui/mod.rs b/lib/flash-rs/src/tui/mod.rs new file mode 100644 index 0000000..5c7114f --- /dev/null +++ b/lib/flash-rs/src/tui/mod.rs @@ -0,0 +1,13 @@ +mod dashboard; +mod error; +mod layout; +mod panel; +mod widget; + +#[cfg(feature = "clap")] +mod layout_str; + +pub use {dashboard::StatsDashboard, error::TuiError, layout::GridLayout}; + +#[cfg(feature = "clap")] +pub use layout_str::GridLayoutParseError; diff --git a/lib/flash-rs/src/tui/panel.rs b/lib/flash-rs/src/tui/panel.rs new file mode 100644 index 0000000..e072e71 --- /dev/null +++ b/lib/flash-rs/src/tui/panel.rs @@ -0,0 +1,91 @@ +use std::{sync::Arc, time::Instant}; + +use ratatui::{ + Frame, + layout::{Constraint, Layout, Rect}, + widgets::Block, +}; + +use crate::stats::{AppStats, RingStats, Stats, XdpStats}; + +#[derive(Debug)] +pub(super) struct StatsPanel { + view: StatsView, + area: Rect, + block: Block<'static>, + inner_area: Rect, +} + +impl 
StatsPanel { + pub(super) fn new(index: usize, stats: Arc) -> Self { + Self { + view: StatsView::new(stats), + area: Rect::default(), + block: Block::bordered().title(format!(" Socket {} Stats ", index + 1)), + inner_area: Rect::default(), + } + } + + pub(super) fn resize(&mut self, area: Rect) { + self.area = area; + self.inner_area = self.block.inner(area); + } + + pub(super) fn render(&mut self, frame: &mut Frame<'_>) { + frame.render_widget(self.block.clone(), self.area); + self.view.render(frame, self.inner_area); + } +} + +#[derive(Debug)] +pub struct StatsView { + stats: Arc, + last_timestamp: Instant, + + ring_stats: RingStats, + app_stats: AppStats, + xdp_stats: XdpStats, + + v_layout: Layout, + h_layout: Layout, +} + +impl StatsView { + pub fn new(stats: Arc) -> Self { + Self { + stats, + last_timestamp: Instant::now(), + + ring_stats: RingStats::default(), + app_stats: AppStats::default(), + xdp_stats: XdpStats::default(), + + v_layout: Layout::vertical(Constraint::from_fills([4, 7])), + h_layout: Layout::horizontal(Constraint::from_fills([1; 2])).spacing(2), + } + } + + #[allow(clippy::cast_possible_truncation)] + pub fn render(&mut self, frame: &mut Frame<'_>, area: Rect) { + let [v1, v2] = self.v_layout.areas(area); + let [h1_1, h1_2] = self.h_layout.areas(v1); + let [h2_1, h2_2] = self.h_layout.areas(v2); + + let ring_stats = self.stats.get_ring_stats(); + let app_stats = self.stats.get_app_stats(); + let xdp_stats = self.stats.get_xdp_stats().unwrap_or(self.xdp_stats); + + let now = Instant::now(); + let diff = now.duration_since(self.last_timestamp).as_nanos() as u64; + + self.stats.render(frame, h1_1); + ring_stats.render(&self.ring_stats, diff, frame, h1_2); + app_stats.render(&self.app_stats, diff, frame, h2_1); + xdp_stats.render(&self.xdp_stats, diff, frame, h2_2); + + self.ring_stats = ring_stats; + self.app_stats = app_stats; + self.xdp_stats = xdp_stats; + self.last_timestamp = now; + } +} diff --git a/lib/flash-rs/src/tui/widget/app.rs 
b/lib/flash-rs/src/tui/widget/app.rs new file mode 100644 index 0000000..d41b0bc --- /dev/null +++ b/lib/flash-rs/src/tui/widget/app.rs @@ -0,0 +1,90 @@ +use ratatui::{ + Frame, + layout::{Constraint, Rect}, + prelude::Alignment, + style::{Color, Style}, + text::ToText as _, + widgets::{Row, Table}, +}; + +use crate::stats::AppStats; + +use super::max_len; + +const HEADERS: [&str; 6] = [ + "rx empty polls", + "fill fail polls", + "tx copy sendtos", + "tx wakeup sendtos", + "opt polls", + "backpressure", +]; +const MAX_HEADER_LEN: u16 = max_len!(HEADERS); + +impl AppStats { + pub(crate) fn render(&self, old_stats: &Self, diff: u64, frame: &mut Frame<'_>, area: Rect) { + let rx_empty_polls_cps = + ((self.rx_empty_polls - old_stats.rx_empty_polls) * 1_000_000_000) / diff; + let fill_fail_polls_cps = + ((self.fill_fail_polls - old_stats.fill_fail_polls) * 1_000_000_000) / diff; + let tx_copy_sendtos_cps = + ((self.tx_copy_sendtos - old_stats.tx_copy_sendtos) * 1_000_000_000) / diff; + let tx_wakeup_sendtos_cps = + ((self.tx_wakeup_sendtos - old_stats.tx_wakeup_sendtos) * 1_000_000_000) / diff; + let opt_polls_cps = ((self.opt_polls - old_stats.opt_polls) * 1_000_000_000) / diff; + let backpressure_cps = + ((self.backpressure - old_stats.backpressure) * 1_000_000_000) / diff; + + let rows = [ + Row::new([ + HEADERS[0].to_text(), + rx_empty_polls_cps.to_text().alignment(Alignment::Right), + self.rx_empty_polls.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[1].to_text(), + fill_fail_polls_cps.to_text().alignment(Alignment::Right), + self.fill_fail_polls.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[2].to_text(), + tx_copy_sendtos_cps.to_text().alignment(Alignment::Right), + self.tx_copy_sendtos.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[3].to_text(), + tx_wakeup_sendtos_cps.to_text().alignment(Alignment::Right), + self.tx_wakeup_sendtos.to_text().alignment(Alignment::Right), + ]), + Row::new([ + 
HEADERS[4].to_text(), + opt_polls_cps.to_text().alignment(Alignment::Right), + self.opt_polls.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[5].to_text(), + backpressure_cps.to_text().alignment(Alignment::Right), + self.backpressure.to_text().alignment(Alignment::Right), + ]), + ]; + + let table = Table::new( + rows, + [ + Constraint::Length(MAX_HEADER_LEN), + Constraint::Fill(1), + Constraint::Fill(1), + ], + ) + .header( + Row::new([ + "App".to_text(), + "calls/s".to_text().alignment(Alignment::Right), + "count".to_text().alignment(Alignment::Right), + ]) + .style(Style::default().fg(Color::Yellow)), + ); + + frame.render_widget(table, area); + } +} diff --git a/lib/flash-rs/src/tui/widget/meta.rs b/lib/flash-rs/src/tui/widget/meta.rs new file mode 100644 index 0000000..e182be9 --- /dev/null +++ b/lib/flash-rs/src/tui/widget/meta.rs @@ -0,0 +1,36 @@ +use chrono::Local; +use ratatui::{ + Frame, + layout::{Constraint, Rect}, + text::ToText as _, + widgets::{Row, Table}, +}; + +use crate::stats::Stats; + +use super::max_len; + +const HEADERS: [&str; 3] = ["interface", "xdp flags", "timestamp"]; +const MAX_HEADER_LEN: u16 = max_len!(HEADERS); + +impl Stats { + pub(crate) fn render(&self, frame: &mut Frame<'_>, area: Rect) { + let interface = format!("{}:{}", self.interface.name, self.interface.queue); + let xdp_flags = format!("{:?}", self.xdp_flags); + let tstamp = Local::now().format("%H:%M:%S%.3f"); + + let rows = [ + Row::new([HEADERS[0].to_text(), interface.to_text()]), + Row::new([HEADERS[1].to_text(), xdp_flags.to_text()]), + Row::new([HEADERS[2].to_text(), tstamp.to_text()]), + ]; + + let table = Table::new( + rows, + [Constraint::Length(MAX_HEADER_LEN), Constraint::Fill(1)], + ) + .header(Row::new([""; 0])); + + frame.render_widget(table, area); + } +} diff --git a/lib/flash-rs/src/tui/widget/mod.rs b/lib/flash-rs/src/tui/widget/mod.rs new file mode 100644 index 0000000..a8787aa --- /dev/null +++ b/lib/flash-rs/src/tui/widget/mod.rs @@ 
-0,0 +1,25 @@ +mod app; +mod meta; +mod ring; +mod xdp; + +macro_rules! max_len { + ($arr:expr) => {{ + #[allow(clippy::cast_possible_truncation)] + { + let arr = $arr; + let (mut max, mut i) = (0, 0); + + while i < arr.len() { + if arr[i].len() > max { + max = arr[i].len(); + } + i += 1; + } + + max as u16 + } + }}; +} + +pub(super) use max_len; diff --git a/lib/flash-rs/src/tui/widget/ring.rs b/lib/flash-rs/src/tui/widget/ring.rs new file mode 100644 index 0000000..8c0ee6a --- /dev/null +++ b/lib/flash-rs/src/tui/widget/ring.rs @@ -0,0 +1,60 @@ +use ratatui::{ + Frame, + layout::{Constraint, Rect}, + prelude::Alignment, + style::{Color, Style}, + text::ToText as _, + widgets::{Row, Table}, +}; + +use crate::stats::RingStats; + +use super::max_len; + +const HEADERS: [&str; 3] = ["rx", "tx", "drop"]; +const MAX_HEADER_LEN: u16 = max_len!(HEADERS); + +impl RingStats { + pub(crate) fn render(&self, old_stats: &Self, diff: u64, frame: &mut Frame<'_>, area: Rect) { + let rx_pps = ((self.rx - old_stats.rx) * 1_000_000_000) / diff; + let tx_pps = ((self.tx - old_stats.tx) * 1_000_000_000) / diff; + let drop_pps = ((self.drop - old_stats.drop) * 1_000_000_000) / diff; + + let rows = [ + Row::new([ + HEADERS[0].to_text(), + rx_pps.to_text().alignment(Alignment::Right), + self.rx.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[1].to_text(), + tx_pps.to_text().alignment(Alignment::Right), + self.tx.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[2].to_text(), + drop_pps.to_text().alignment(Alignment::Right), + self.drop.to_text().alignment(Alignment::Right), + ]), + ]; + + let table = Table::new( + rows, + [ + Constraint::Length(MAX_HEADER_LEN), + Constraint::Fill(1), + Constraint::Fill(1), + ], + ) + .header( + Row::new([ + "Ring".to_text(), + "pps".to_text().alignment(Alignment::Right), + "pkts".to_text().alignment(Alignment::Right), + ]) + .style(Style::default().fg(Color::Yellow)), + ); + + frame.render_widget(table, area); + } +} 
diff --git a/lib/flash-rs/src/tui/widget/xdp.rs b/lib/flash-rs/src/tui/widget/xdp.rs new file mode 100644 index 0000000..fab806c --- /dev/null +++ b/lib/flash-rs/src/tui/widget/xdp.rs @@ -0,0 +1,95 @@ +use ratatui::{ + Frame, + layout::{Constraint, Rect}, + prelude::Alignment, + style::{Color, Style}, + text::ToText as _, + widgets::{Row, Table}, +}; + +use crate::stats::XdpStats; + +use super::max_len; + +const HEADERS: [&str; 6] = [ + "rx dropped", + "rx invalid", + "tx invalid", + "rx ring full", + "rx fill ring empty", + "tx ring empty", +]; +const MAX_HEADER_LEN: u16 = max_len!(HEADERS); + +impl XdpStats { + pub(crate) fn render(&self, old_stats: &Self, diff: u64, frame: &mut Frame<'_>, area: Rect) { + let rx_dropped_pps = ((self.rx_dropped - old_stats.rx_dropped) * 1_000_000_000) / diff; + let rx_invalid_pps = + ((self.rx_invalid_descs - old_stats.rx_invalid_descs) * 1_000_000_000) / diff; + let tx_invalid_pps = + ((self.tx_invalid_descs - old_stats.tx_invalid_descs) * 1_000_000_000) / diff; + let rx_ring_full_pps = + ((self.rx_ring_full - old_stats.rx_ring_full) * 1_000_000_000) / diff; + let rx_fill_ring_empty_pps = + ((self.rx_fill_ring_empty_descs - old_stats.rx_fill_ring_empty_descs) * 1_000_000_000) + / diff; + let tx_ring_empty_pps = + ((self.tx_ring_empty_descs - old_stats.tx_ring_empty_descs) * 1_000_000_000) / diff; + + let rows = [ + Row::new([ + HEADERS[0].to_text(), + rx_dropped_pps.to_text().alignment(Alignment::Right), + self.rx_dropped.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[1].to_text(), + rx_invalid_pps.to_text().alignment(Alignment::Right), + self.rx_invalid_descs.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[2].to_text(), + tx_invalid_pps.to_text().alignment(Alignment::Right), + self.tx_invalid_descs.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[3].to_text(), + rx_ring_full_pps.to_text().alignment(Alignment::Right), + 
self.rx_ring_full.to_text().alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[4].to_text(), + rx_fill_ring_empty_pps.to_text().alignment(Alignment::Right), + self.rx_fill_ring_empty_descs + .to_text() + .alignment(Alignment::Right), + ]), + Row::new([ + HEADERS[5].to_text(), + tx_ring_empty_pps.to_text().alignment(Alignment::Right), + self.tx_ring_empty_descs + .to_text() + .alignment(Alignment::Right), + ]), + ]; + + let table = Table::new( + rows, + [ + Constraint::Length(MAX_HEADER_LEN), + Constraint::Fill(1), + Constraint::Fill(1), + ], + ) + .header( + Row::new([ + "XDP".to_text(), + "pps".to_text().alignment(Alignment::Right), + "pkts".to_text().alignment(Alignment::Right), + ]) + .style(Style::default().fg(Color::Yellow)), + ); + + frame.render_widget(table, area); + } +} diff --git a/lib/flash-rs/src/xsk/mod.rs b/lib/flash-rs/src/xsk/mod.rs index 04f9e9b..b22fee0 100644 --- a/lib/flash-rs/src/xsk/mod.rs +++ b/lib/flash-rs/src/xsk/mod.rs @@ -4,5 +4,4 @@ mod socket; pub(crate) use shared::SocketShared; -pub use error::SocketError; -pub use socket::Socket; +pub use {error::SocketError, socket::Socket}; diff --git a/lib/flash-rs/src/xsk/socket.rs b/lib/flash-rs/src/xsk/socket.rs index d332c02..8da1b48 100644 --- a/lib/flash-rs/src/xsk/socket.rs +++ b/lib/flash-rs/src/xsk/socket.rs @@ -1,4 +1,4 @@ -use std::{io, sync::Arc, thread}; +use std::{sync::Arc, thread}; use quanta::{Clock, Instant}; @@ -88,13 +88,13 @@ impl Socket { #[allow(clippy::missing_errors_doc)] #[inline] - pub fn poll(&mut self) -> io::Result { + pub fn poll(&mut self) -> SocketResult { if self.shared.xsk_config.mode.contains(Mode::FLASH_POLL) { #[cfg(feature = "stats")] unsafe { (*self.stats.app.get()).opt_polls += 1; } - self.fd.poll() + Ok(self.fd.poll()?) 
} else { Ok(true) } @@ -152,10 +152,10 @@ impl Socket { } for _ in 0..completed { - if let Some(fill_addr) = self.fill.addr(idx_fq) { - if let Some(comp_addr) = self.comp.addr(idx_cq) { - *fill_addr = *comp_addr; - } + if let Some(fill_addr) = self.fill.addr(idx_fq) + && let Some(comp_addr) = self.comp.addr(idx_cq) + { + *fill_addr = *comp_addr; } idx_fq += 1; @@ -211,14 +211,15 @@ impl Socket { return idx_tx; } - if let Some(poll_config) = &self.shared.poll_config { - if self.outstanding_tx >= poll_config.bp_threshold { - thread::sleep(poll_config.bp_timeout); + if let Some(poll_config) = &self.shared.poll_config + && self.outstanding_tx >= poll_config.bp_threshold + { + thread::sleep(poll_config.bp_timeout); + self.idle_timestamp = None; - #[cfg(feature = "stats")] - unsafe { - (*self.stats.app.get()).backpressure += 1; - } + #[cfg(feature = "stats")] + unsafe { + (*self.stats.app.get()).backpressure += 1; } } } @@ -265,14 +266,13 @@ impl Socket { pub fn recv(&mut self) -> SocketResult> { self.complete_tx_rx(); - if let Some(poll_config) = &self.shared.poll_config { - if let Some(idle_timestamp) = self.idle_timestamp { - if self.clock.now() >= idle_timestamp && !self.fd.poll()? { - self.idle_timestamp = self.clock.now().checked_add(poll_config.idle_timeout); - - return Ok(vec![]); - } - } + if let Some(poll_config) = &self.shared.poll_config + && let Some(idle_timestamp) = self.idle_timestamp + && self.clock.now() >= idle_timestamp + && !self.fd.poll()? 
+ { + self.idle_timestamp = self.clock.now().checked_add(poll_config.idle_timeout); + return Ok(vec![]); } let mut idx_rx = 0; @@ -289,19 +289,19 @@ impl Socket { self.fd.wakeup(); } - if let Some(poll_config) = &self.shared.poll_config { - if self.idle_timestamp.is_none() { - self.idle_timestamp = self.clock.now().checked_add(poll_config.idle_timeout); - } + if let Some(poll_config) = &self.shared.poll_config + && self.idle_timestamp.is_none() + { + self.idle_timestamp = self.clock.now().checked_add(poll_config.idle_timeout); } return Ok(vec![]); } - if let Some(poll_config) = &self.shared.poll_config { - if rcvd >= poll_config.idle_threshold || self.outstanding_tx > 0 { - self.idle_timestamp = None; - } + if let Some(poll_config) = &self.shared.poll_config + && (rcvd >= poll_config.idle_threshold || self.outstanding_tx > 0) + { + self.idle_timestamp = None; } #[cfg(feature = "tracing")] @@ -394,7 +394,8 @@ impl Socket { } #[cfg(feature = "pool")] - self.pool.extend(descs.into_iter().map(Desc::extract_addr)); + self.pool + .put_batch(descs.into_iter().map(Desc::extract_addr)); #[cfg(not(feature = "pool"))] { From 98e172ea9f94e58a8dfaa8755a676baa0744d59e Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Thu, 30 Oct 2025 16:55:23 +0530 Subject: [PATCH 34/43] chore: cleaned config files --- config/{config-2.json => chain-config.json} | 2 +- config/{config.json => complex-config.json} | 22 ++-- config/config-3.json | 57 --------- config/config-4.json | 71 ----------- config/config-5.json | 85 ------------- config/config-6.json | 99 --------------- config/config-7.json | 113 ----------------- config/config-8.json | 124 ------------------- config/config-rr1.json | 56 --------- config/config-rr2.json | 72 ----------- config/{config-1.json => simple-config.json} | 2 +- 11 files changed, 10 insertions(+), 693 deletions(-) rename config/{config-2.json => chain-config.json} (96%) rename config/{config.json => complex-config.json} (76%) delete mode 100644 config/config-3.json 
delete mode 100644 config/config-4.json delete mode 100644 config/config-5.json delete mode 100644 config/config-6.json delete mode 100644 config/config-7.json delete mode 100644 config/config-8.json delete mode 100644 config/config-rr1.json delete mode 100644 config/config-rr2.json rename config/{config-1.json => simple-config.json} (94%) diff --git a/config/config-2.json b/config/chain-config.json similarity index 96% rename from config/config-2.json rename to config/chain-config.json index e5f621e..c4db979 100644 --- a/config/config-2.json +++ b/config/chain-config.json @@ -26,7 +26,7 @@ ] } ], - "ifname": "ens23f0np0", + "ifname": "enp1s0", "xdp_flags": "d", "bind_flags": "z", "mode": "b", diff --git a/config/config.json b/config/complex-config.json similarity index 76% rename from config/config.json rename to config/complex-config.json index c05f27e..5ca1621 100644 --- a/config/config.json +++ b/config/complex-config.json @@ -11,23 +11,16 @@ { "thread_id": 0, "queue": 0 - } - ] - }, - { - "nf_id": 1, - "nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ + }, { - "thread_id": 0, + "thread_id": 1, "queue": 1 } ] }, { - "nf_id": 2, - "nf_ip": "192.168.0.3", + "nf_id": 1, + "nf_ip": "192.168.0.2", "nf_port": 1234, "thread": [ { @@ -37,7 +30,7 @@ ] } ], - "ifname": "ens23f0np0", + "ifname": "enp1s0", "xdp_flags": "d", "bind_flags": "z", "mode": "b", @@ -48,10 +41,11 @@ ], "route": { "0": [ - 1, 2 ], - "1": [], + "1": [ + 2 + ], "2": [] } } \ No newline at end of file diff --git a/config/config-3.json b/config/config-3.json deleted file mode 100644 index f81b112..0000000 --- a/config/config-3.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "umem": [ - { - "umem_id": 0, - "nf": [ - { - "nf_id": 0, - "nf_ip": "192.168.0.1", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 0 - } - ] - }, - { - "nf_id": 1, - "nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 1 - } - ] - }, - { - "nf_id": 2, - "nf_ip": "192.168.0.3", 
- "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 2 - } - ] - } - ], - "ifname": "ens23f0np0", - "xdp_flags": "d", - "bind_flags": "z", - "mode": "b", - "custom_xsk": false, - "frags_enabled": false - } - ], - "route": { - "0": [ - 1 - ], - "1": [ - 2 - ], - "2": [] - } -} \ No newline at end of file diff --git a/config/config-4.json b/config/config-4.json deleted file mode 100644 index dc9d105..0000000 --- a/config/config-4.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "umem": [ - { - "umem_id": 0, - "nf": [ - { - "nf_id": 0, - "nf_ip": "192.168.0.1", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 0 - } - ] - }, - { - "nf_id": 1, - "nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 1 - } - ] - }, - { - "nf_id": 2, - "nf_ip": "192.168.0.3", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 2 - } - ] - }, - { - "nf_id": 3, - "nf_ip": "192.168.0.4", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 3 - } - ] - } - ], - "ifname": "ens23f0np0", - "xdp_flags": "d", - "bind_flags": "z", - "mode": "b", - "custom_xsk": false, - "frags_enabled": false - } - ], - "route": { - "0": [ - 1 - ], - "1": [ - 2 - ], - "2": [ - 3 - ], - "3": [] - } -} \ No newline at end of file diff --git a/config/config-5.json b/config/config-5.json deleted file mode 100644 index 632b7f7..0000000 --- a/config/config-5.json +++ /dev/null @@ -1,85 +0,0 @@ -{ - "umem": [ - { - "umem_id": 0, - "nf": [ - { - "nf_id": 0, - "nf_ip": "192.168.0.1", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 0 - } - ] - }, - { - "nf_id": 1, - "nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 1 - } - ] - }, - { - "nf_id": 2, - "nf_ip": "192.168.0.3", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 2 - } - ] - }, - { - "nf_id": 3, - "nf_ip": "192.168.0.4", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 3 - } - ] - }, - { - 
"nf_id": 4, - "nf_ip": "192.168.0.5", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 4 - } - ] - } - ], - "ifname": "ens23f0np0", - "xdp_flags": "d", - "bind_flags": "z", - "mode": "b", - "custom_xsk": false, - "frags_enabled": false - } - ], - "route": { - "0": [ - 1 - ], - "1": [ - 2 - ], - "2": [ - 3 - ], - "3": [ - 4 - ], - "4": [] - } -} \ No newline at end of file diff --git a/config/config-6.json b/config/config-6.json deleted file mode 100644 index aa87318..0000000 --- a/config/config-6.json +++ /dev/null @@ -1,99 +0,0 @@ -{ - "umem": [ - { - "umem_id": 0, - "nf": [ - { - "nf_id": 0, - "nf_ip": "192.168.0.1", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 0 - } - ] - }, - { - "nf_id": 1, - "nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 1 - } - ] - }, - { - "nf_id": 2, - "nf_ip": "192.168.0.3", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 2 - } - ] - }, - { - "nf_id": 3, - "nf_ip": "192.168.0.4", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 3 - } - ] - }, - { - "nf_id": 4, - "nf_ip": "192.168.0.5", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 4 - } - ] - }, - { - "nf_id": 5, - "nf_ip": "192.168.0.6", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 5 - } - ] - } - ], - "ifname": "ens23f0np0", - "xdp_flags": "d", - "bind_flags": "z", - "mode": "b", - "custom_xsk": false, - "frags_enabled": false - } - ], - "route": { - "0": [ - 1 - ], - "1": [ - 2 - ], - "2": [ - 3 - ], - "3": [ - 4 - ], - "4": [ - 5 - ], - "5": [] - } -} \ No newline at end of file diff --git a/config/config-7.json b/config/config-7.json deleted file mode 100644 index f32532a..0000000 --- a/config/config-7.json +++ /dev/null @@ -1,113 +0,0 @@ -{ - "umem": [ - { - "umem_id": 0, - "nf": [ - { - "nf_id": 0, - "nf_ip": "192.168.0.1", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 0 - } - ] - }, - { - "nf_id": 1, - 
"nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 1 - } - ] - }, - { - "nf_id": 2, - "nf_ip": "192.168.0.3", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 2 - } - ] - }, - { - "nf_id": 3, - "nf_ip": "192.168.0.4", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 3 - } - ] - }, - { - "nf_id": 4, - "nf_ip": "192.168.0.5", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 4 - } - ] - }, - { - "nf_id": 5, - "nf_ip": "192.168.0.6", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 5 - } - ] - }, - { - "nf_id": 6, - "nf_ip": "192.168.0.7", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 6 - } - ] - } - ], - "ifname": "ens23f0np0", - "xdp_flags": "d", - "bind_flags": "z", - "mode": "b", - "custom_xsk": false, - "frags_enabled": false - } - ], - "route": { - "0": [ - 1 - ], - "1": [ - 2 - ], - "2": [ - 3 - ], - "3": [ - 4 - ], - "4": [ - 5 - ], - "5": [ - 6 - ], - "6": [] - } -} \ No newline at end of file diff --git a/config/config-8.json b/config/config-8.json deleted file mode 100644 index 2b8494d..0000000 --- a/config/config-8.json +++ /dev/null @@ -1,124 +0,0 @@ -{ - "umem": [ - { - "umem_id": 0, - "nf": [ - { - "nf_id": 0, - "nf_ip": "192.168.0.1", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 0 - } - ] - }, - { - "nf_id": 1, - "nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 1 - } - ] - }, - { - "nf_id": 2, - "nf_ip": "192.168.0.3", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 2 - } - ] - }, - { - "nf_id": 3, - "nf_ip": "192.168.0.4", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 3 - } - ] - }, - { - "nf_id": 4, - "nf_ip": "192.168.0.5", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 4 - } - ] - }, - { - "nf_id": 5, - "nf_ip": "192.168.0.6", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 5 - } - ] - }, - 
{ - "nf_id": 6, - "nf_ip": "192.168.0.7", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 6 - } - ] - }, - { - "nf_id": 7, - "nf_ip": "192.168.0.8", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 7 - } - ] - } - ], - "ifname": "ens23f0np0", - "xdp_flags": "d", - "bind_flags": "z", - "mode": "b", - "custom_xsk": false, - "frags_enabled": false - } - ], - "route": { - "0": [ - 1 - ], - "1": [ - 2 - ], - "2": [ - 3 - ], - "3": [ - 4 - ], - "4": [ - 5 - ], - "5": [ - 6 - ], - "6": [] - } -} \ No newline at end of file diff --git a/config/config-rr1.json b/config/config-rr1.json deleted file mode 100644 index 0e60e69..0000000 --- a/config/config-rr1.json +++ /dev/null @@ -1,56 +0,0 @@ -{ - "umem": [ - { - "umem_id": 0, - "nf": [ - { - "nf_id": 0, - "nf_ip": "192.168.0.1", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 0 - } - ] - }, - { - "nf_id": 1, - "nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 1 - } - ] - }, - { - "nf_id": 2, - "nf_ip": "192.168.0.3", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 2 - } - ] - } - ], - "ifname": "ens23f0np0", - "xdp_flags": "d", - "bind_flags": "z", - "mode": "b", - "custom_xsk": false, - "frags_enabled": false - } - ], - "route": { - "0": [ - 1, - 2 - ], - "1": [], - "2": [] - } -} \ No newline at end of file diff --git a/config/config-rr2.json b/config/config-rr2.json deleted file mode 100644 index 04768cf..0000000 --- a/config/config-rr2.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "umem": [ - { - "umem_id": 0, - "nf": [ - { - "nf_id": 0, - "nf_ip": "192.168.0.1", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 0 - } - ] - }, - { - "nf_id": 1, - "nf_ip": "192.168.0.2", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 1 - } - ] - }, - { - "nf_id": 2, - "nf_ip": "192.168.0.3", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 2 - } - ] - }, - { - "nf_id": 3, - "nf_ip": 
"192.168.0.4", - "nf_port": 1234, - "thread": [ - { - "thread_id": 0, - "queue": 3 - } - ] - } - ], - "ifname": "ens23f0np0", - "xdp_flags": "d", - "bind_flags": "z", - "mode": "b", - "custom_xsk": false, - "frags_enabled": false - } - ], - "route": { - "0": [ - 1, - 2 - ], - "1": [ - 3 - ], - "2": [ - 3 - ], - "3": [] - } -} \ No newline at end of file diff --git a/config/config-1.json b/config/simple-config.json similarity index 94% rename from config/config-1.json rename to config/simple-config.json index 01f0f53..0730606 100644 --- a/config/config-1.json +++ b/config/simple-config.json @@ -15,7 +15,7 @@ ] } ], - "ifname": "ens23f0np0", + "ifname": "enp1s0", "xdp_flags": "d", "bind_flags": "z", "mode": "b", From f3efb68a4c876a3ae96051ec8b19bc42f1e512f7 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Thu, 30 Oct 2025 23:18:18 +0530 Subject: [PATCH 35/43] docs: updated usage instructions --- README.md | 89 ++++++++++++++++++++++++++++---- doc/flash_kernel/flash_kernel.md | 6 +-- 2 files changed, 83 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index c9adb46..ce2ef5c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ FLASH is a high-speed userspace library that makes it easy to build efficient, unprivileged AF_XDP applications for modern cloud and edge deployments. -Seamlessly integrated with the **FLASH kernel**, it extends AF_XDP to enable true zero-copy packet sharing between network functions (NFs) and network devices, unlocking performance that surpasses traditional AF_XDP chaining solutions. +Seamlessly integrated with the [**FLASH kernel**](https://github.com/networkedsystemsIITB/flash-linux), it extends AF_XDP to enable true zero-copy packet sharing between network functions (NFs) and network devices, unlocking performance that surpasses traditional AF_XDP chaining solutions. ## Key Features - **Zero-Copy Packet Sharing**: Unlock unparalleled throughput and minimal latency with zero-copy data paths between NFs and network devices. 
@@ -54,7 +54,7 @@ sudo apt install -y build-essential meson libbpf-dev pkg-config git gcc-multilib
 
 ```bash
 git clone https://github.com/xdp-project/xdp-tools.git
-make -j -C xdp-tools libxdp
+make PREFIX=/usr -j -C xdp-tools libxdp
 sudo PREFIX=/usr make -j -C xdp-tools libxdp_install
 ```
 
@@ -70,22 +70,93 @@ Once dependencies are ready, build the library and examples:
 make
 ```
 
-### Usage
+### Basic Usage
 
-The library offers a straightforward API for constructing and executing AF_XDP. Applications may utilize the library to construct and operate AF_XDP sockets either through the FLASH monitor, which manages the control plane, or directly via the library. The monitor enables applications to execute multiple AF_XDP applications simultaneously, whether sharing memory in privileged or non-privileged modes.
+FLASH provides two primary userspace components:
+1. **NF Libraries**: Used to build AF_XDP applications with FLASH support (available in C and Rust).
+2. **Monitor**: A control-plane application that manages AF_XDP socket configurations and enables unprivileged NF operation.
 
-#### Using Monitor
+#### Run a Sample L2FWD NF (Switch)
+
+You can test a sample NF on any Linux kernel (no FLASH kernel required):
 
 ```bash
 sudo ./build/monitor/monitor
 ```
+
+A TUI will start, allowing you to configure AF_XDP.
+Configurations are stored as JSON and can be loaded/unloaded on demand.
+
+Load a sample configuration from [`config/simple-config.json`](./config/simple-config.json), updating interface names as needed:
+
+```console
+flash:/> load config config/simple-config.json
+```
 
-A TUI will be initiated, allowing configuration parameters to be passed to setup AF_XDP setups and chains. Configurations will be stored in a JSON file and loaded/unloaded on demand. Once the monitor has started and the configuration is properly set, NFs can commence running without the need for any privileges. A sample NF usage instruction is provided below.
+Then run the L2FWD example (no root needed): ```bash -./build/examples/l2fwd/l2fwd -u 0 -f 1 -ax -- -s 0 -c 2 -e 3 +./build/examples/l2fwd/l2fwd -u 0 -f 0 # C based NF +# or +./build/rust-target/release/l2fwd -u 0 -f 0 # Rust based NF +``` + +`-u 0` → UMEM ID +`-f 0` → NF ID +Both values are defined in the monitor configuration. + + +### Chaining AF_XDP Applications with FLASH + +FLASH allows chaining multiple AF_XDP-based network functions (NFs) together including independent NFs written in different languages using either copy-based or zero-copy modes. + +- **Copy-Based Chaining (Legacy Compatible):** +Works with any existing AF_XDP applications. +Requires no code changes. +FLASH-based NFs can also operate in this mode. +Refer to the sysfs usage in [FLASH Kernel Guide](./doc/flash_kernel/flash_kernel.md) for copy-based setup instructions. + +- **Zero-Copy Chaining:** +Achieved when multiple NFs share the same UMEM region. +Automatically handled by FLASH-based NFs. + +#### Example: Linear Chaining Between Two AF_XDP Applications + +Let’s consider chaining two independent L2 forwarders, one written in C and another in Rust. They only share the same UMEM region for zero-copy operation. + +We can use [`config/chain-config.json`](./config/chain-config.json) as a starting point. + +a. Start the monitor and load the configuration: + +```console +sudo ./build/monitor/monitor +flash:/> load config config/chain-config.json ``` +b. Start the first l2fwd application (C based): + +```bash +./build/examples/l2fwd/l2fwd -u 0 -f 0 -- -s 0 -c 1 -e 1 +``` + +c. Start the second l2fwd application (Rust based): + +```bash +./build/rust-target/release/l2fwd -u 0 -f 1 -s 1 -c 2 -e 2 +``` + +d. Chain them using the monitor TUI: + +```console +flash:/> load route +``` +The configuration file defines routes from `flash_id 0` → `flash_id 1`. +Upon loading, the monitor programs the FLASH kernel with these redirection rules. 
+ +You can then send packets to the first NF and observe them forwarded to the second at high throughput. + +To extend chaining, add more NFs to the configuration file and specify their connections accordingly. + + ## Docker Usage Instructions Docker containers enable the consistent and isolated deployment of NFs in a portable development environment, facilitating the entry of NF developers into the setup process. To begin, you can create an image of the FLASH container. @@ -107,7 +178,7 @@ docker run -rm -it --privileged -v /tmp/flash/:/tmp/flash/ --net=host flash:mon If the monitor is ready and running, the NF can be initiated using the following command: ```bash -docker run --rm -it -v /tmp/flash/:/tmp/flash/ flash:dev ./build/examples/l2fwd/l2fwd -u 0 -f 1 -ax -- -s 0 -c 2 -e 3 +docker run --rm -it -v /tmp/flash/:/tmp/flash/ flash:dev ./build/examples/l2fwd/l2fwd -u 0 -f 1 ``` > `/tmp/flash` contains a UDS socket that is used by NFs to communicate with the monitor. @@ -116,4 +187,4 @@ You can also use docker compose to deploy multiple NFs at the same time. ```bash docker compose up -d -``` +``` \ No newline at end of file diff --git a/doc/flash_kernel/flash_kernel.md b/doc/flash_kernel/flash_kernel.md index 0d44556..5dc3300 100644 --- a/doc/flash_kernel/flash_kernel.md +++ b/doc/flash_kernel/flash_kernel.md @@ -120,7 +120,7 @@ uname -r FLASH kernel exposes a sysfs interface under `/sys/kernel/flash` allowing privileged users to: - Inspect active AF_XDP sockets -- CConfigure redirection rules between sockets +- Configure redirection rules between sockets - Adjust per-socket parameters (e.g., TX tracking) Each AF_XDP socket is identified by a process-independent identifier called a flash-id, which enables cross-process management. @@ -217,7 +217,7 @@ FLASH ensures that `poll()` correctly reflects packet readiness even when redire Backpressure arises when downstream sockets (receivers) cannot process packets as fast as they are being produced. 
The FLASH Kernel introduces natural backpressure into the AF_XDP data path via the TX and CQ rings: packets are not transmitted to downstream sockets if their RX rings are full. -In this scenario, the sender must retry transmissions untill space becomes available. To avoid wasting CPU cycles through busy-waiting, applications should rely on the following readiness mechanism: +In this scenario, the sender must retry transmissions until space becomes available. To avoid wasting CPU cycles through busy-waiting, applications should rely on the following readiness mechanism: - When congestion is detected on a sender socket, use `poll()` with the `POLLOUT` flag to sleep until the socket is ready to send again. - Receiver sockets should use `recvfrom()` with the `MSG_MORE` flag to implicitly signal the sender once they have freed space. @@ -234,7 +234,7 @@ echo 0 | sudo tee /sys/kernel/flash/tx_tracking # Disable TX tracking ``` When enabled, FLASH tracks packet transmission status on a per-flow basis. -As packets are transmitted succesfully, the kernel writes back the `flash_id` of the downstream socket into the memory location specified by the packet descriptor before returning it to the completion queue. This allows user-space applications to identify which downstream path successfully transmitted each packet. +As packets are transmitted successfully, the kernel writes back the `flash_id` of the downstream socket into the memory location specified by the packet descriptor before returning it to the completion queue. This allows user-space applications to identify which downstream path successfully transmitted each packet. 
Applications can use this feedback to implement: - Dynamic congestion control or flow rerouting, From 3684002de119198fa1a415026022634c91781a5e Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Thu, 30 Oct 2025 23:34:34 +0530 Subject: [PATCH 36/43] feat: meson builds rust dashboard by default --- examples/meson.build | 5 +++++ meson_options.txt | 3 +++ 2 files changed, 8 insertions(+) diff --git a/examples/meson.build b/examples/meson.build index 263b790..39055a2 100644 --- a/examples/meson.build +++ b/examples/meson.build @@ -45,6 +45,11 @@ if get_option('enable_rust') and cargo.found() if get_option('buildtype') == 'debug' or get_option('buildtype') == 'debugoptimized' cargo_build_args += ['-F', 'tracing'] endif + + if get_option('enable_rust_stats') + cargo_build_args += ['-F', 'stats'] + message('Rust: Enabling stats dashboard feature') + endif rust_build = custom_target( 'rust_workspace', diff --git a/meson_options.txt b/meson_options.txt index 2030945..ded7c68 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -7,5 +7,8 @@ option('log_use_color', type: 'boolean', value: true, option('enable_rust', type: 'boolean', value: true, description: 'Enable building Rust applications and libraries') +option('enable_rust_stats', type: 'boolean', value: true, + description: 'Enable building Rust applications with stats dashboard') + option('enable_mtcp', type: 'boolean', value: true, description: 'Enable building mtcp libraries and examples') From 6374c07546481ab2f1d4975ca2e1a7827fd70113 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Thu, 30 Oct 2025 23:37:37 +0530 Subject: [PATCH 37/43] chore: removed unused old API --- lib/flash/nf/flash_nf.h | 5 - lib/flash/nf/flash_txrx.c | 232 ++------------------------------ lib/flash/params/flash_params.c | 4 +- 3 files changed, 11 insertions(+), 230 deletions(-) diff --git a/lib/flash/nf/flash_nf.h b/lib/flash/nf/flash_nf.h index 5ea4c83..7f6dd05 100644 --- a/lib/flash/nf/flash_nf.h +++ b/lib/flash/nf/flash_nf.h @@ 
-147,11 +147,6 @@ size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk */ size_t flash__allocmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nalloc); -int flash__oldpoll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout); -size_t flash__oldrecvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr *msg); -size_t flash__oldsendmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t nsend); -size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop); - /* Helper APIs */ /** diff --git a/lib/flash/nf/flash_txrx.c b/lib/flash/nf/flash_txrx.c index 793bd8e..6c380a6 100644 --- a/lib/flash/nf/flash_txrx.c +++ b/lib/flash/nf/flash_txrx.c @@ -57,7 +57,7 @@ static inline uint64_t rdtsc_precise(void) return rdtsc(); } -static uint64_t get_tsc_freq(struct config *cfg) +static inline uint64_t get_tsc_freq(struct config *cfg) { #define NS_PER_SEC 1E9 @@ -82,7 +82,7 @@ static uint64_t get_tsc_freq(struct config *cfg) } #endif -static uint64_t get_timer_hz(struct config *cfg) +static inline uint64_t get_timer_hz(struct config *cfg) { if (__hz == 0) __hz = get_tsc_freq(cfg); @@ -90,7 +90,7 @@ static uint64_t get_timer_hz(struct config *cfg) return __hz; } -int flash__oldpoll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout) +static inline int __poll(struct socket *xsk, struct pollfd *fds, nfds_t nfds, int timeout) { #ifdef STATS xsk->app_stats.opt_polls++; @@ -109,7 +109,7 @@ int flash__poll(struct config *cfg, struct socket *xsk, struct pollfd *fds, nfds return poll(fds, nfds, cfg->xsk->poll_timeout); } -static void __kick_tx(struct socket *xsk) +static inline void __kick_tx(struct socket *xsk) { int ret; ret = sendto(xsk->fd, NULL, 0, MSG_DONTWAIT, NULL, 0); @@ -120,7 +120,7 @@ static void __kick_tx(struct socket *xsk) exit(EXIT_FAILURE); } -static void __kick_rx(struct socket *xsk) +static inline void __kick_rx(struct 
socket *xsk) { int ret; ret = recvfrom(xsk->fd, NULL, 0, MSG_MORE, NULL, 0); @@ -131,12 +131,12 @@ static void __kick_rx(struct socket *xsk) exit(EXIT_FAILURE); } -static uint32_t fill_ring_nb_entries(struct socket *xsk) +static inline uint32_t fill_ring_nb_entries(struct socket *xsk) { return xsk->fill.size + (xsk->fill.cached_prod - xsk->fill.cached_cons); } -static uint32_t rx_ring_free_entries(struct socket *xsk) +static inline uint32_t rx_ring_free_entries(struct socket *xsk) { return xsk->rx.size - (xsk->rx.cached_prod - xsk->rx.cached_cons); } @@ -157,54 +157,6 @@ static inline void __try_kick_rx(struct config *cfg, struct socket *xsk) } } -static inline void __complete_tx_rx_first(struct config *cfg, struct socket *xsk) -{ - uint32_t idx_cq = 0, idx_fq = 0; - uint32_t completed, num_outstanding; - - if (!xsk->outstanding_tx) - return; - - /** - * In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to - * really send the packets. In zero-copy mode we do not have to do this, since Tx - * is driven by the NAPI loop. So as an optimization, we do not have to call - * sendto() all the time in zero-copy mode. - */ - if (cfg->xsk->bind_flags & XDP_COPY) { -#ifdef STATS - xsk->app_stats.copy_tx_sendtos++; -#endif - __kick_tx(xsk); - } - - num_outstanding = xsk->outstanding_tx > cfg->xsk->batch_size ? 
cfg->xsk->batch_size : xsk->outstanding_tx; - - /* Re-add completed TX buffers */ - completed = xsk_ring_cons__peek(&xsk->comp, num_outstanding, &idx_cq); - if (completed > 0) { - uint32_t i, ret; - - ret = xsk_ring_prod__reserve(&xsk->fill, completed, &idx_fq); - while (ret != completed) { - if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->fill)) { -#ifdef STATS - xsk->app_stats.fill_fail_polls++; -#endif - recvfrom(xsk->fd, NULL, 0, MSG_DONTWAIT, NULL, NULL); - } - ret = xsk_ring_prod__reserve(&xsk->fill, completed, &idx_fq); - } - - for (i = 0; i < completed; i++) - *xsk_ring_prod__fill_addr(&xsk->fill, idx_fq++) = *xsk_ring_cons__comp_addr(&xsk->comp, idx_cq++); - - xsk_ring_prod__submit(&xsk->fill, completed); - xsk_ring_cons__release(&xsk->comp, completed); - xsk->outstanding_tx -= completed; - } -} - static inline void __complete_tx_completions(struct config *cfg, struct socket *xsk) { uint32_t idx_cq = 0, idx_fq = 0; @@ -310,32 +262,6 @@ static inline uint32_t __reserve_fq(struct config *cfg, struct socket *xsk, uint return idx_fq; } -static inline uint32_t __old_reserve_tx(struct config *cfg, struct socket *xsk, uint32_t num) -{ - uint32_t idx_tx = 0; - uint32_t ret; - - ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); - while (ret != num) { - __complete_tx_rx_first(cfg, xsk); - if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->tx)) { -#ifdef STATS - xsk->app_stats.tx_wakeup_sendtos++; -#endif - __kick_tx(xsk); - } - ret = xsk_ring_prod__reserve(&xsk->tx, num, &idx_tx); - - if (cfg->smart_poll && ret != num && xsk->outstanding_tx >= cfg->xsk->bp_thres) { - usleep(cfg->xsk->bp_timeout); -#ifdef STATS - xsk->app_stats.backpressure++; -#endif - } - } - return idx_tx; -} - static inline uint32_t __reserve_tx(struct config *cfg, struct socket *xsk, uint32_t num) { if (cfg->xsk->mode & FLASH__BUSY_POLL && xsk->outstanding_tx > cfg->xsk->bp_thres / 2) { @@ -359,7 +285,7 @@ static inline uint32_t 
__reserve_tx(struct config *cfg, struct socket *xsk, uint } if (cfg->smart_poll) { cfg->nf_pollout_status[cfg->nf_id] = 1; - ret = flash__oldpoll(xsk, &xsk->backpressure_fd, 1, 1000); + ret = __poll(xsk, &xsk->backpressure_fd, 1, 1000); cfg->nf_pollout_status[cfg->nf_id] = 0; // will wake up when any of the next_nf->fill_ring->buffers > next_nf->fill_ring->size / 2 // and next_nf->rx_ring_free > next_nf->rx_ring_size / 2 @@ -421,77 +347,6 @@ static void __hex_dump(void *pkt, size_t length, uint64_t addr) printf("\n"); } -size_t flash__oldrecvmsg(struct config *cfg, struct socket *xsk, struct xskmsghdr *msg) -{ - int ret; - uint32_t idx_rx = 0; - uint32_t rcvd, i, eop_cnt = 0; - - /* Only Tx currently is not supported - * in that scenario we need to call the following - * function somewhere else in the code - */ - __complete_tx_rx_first(cfg, xsk); - - if (cfg->smart_poll && cfg->xsk->idle_timeout && xsk->idle_timestamp && rdtsc() > xsk->idle_timestamp) { - ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, -1); - if (ret <= 0) { - xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); - return 0; - } - } - - rcvd = xsk_ring_cons__peek(&xsk->rx, cfg->xsk->batch_size, &idx_rx); - if (!rcvd) { - if (cfg->xsk->mode & FLASH__BUSY_POLL || xsk_ring_prod__needs_wakeup(&xsk->fill)) { -#ifdef STATS - xsk->app_stats.rx_empty_polls++; -#endif - recvfrom(xsk->fd, NULL, 0, MSG_DONTWAIT, NULL, NULL); - } - - if (cfg->smart_poll && cfg->xsk->idle_timeout && !xsk->idle_timestamp) - xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); - - return 0; - } - - if (cfg->smart_poll && (rcvd >= cfg->xsk->idle_thres || xsk->outstanding_tx)) - xsk->idle_timestamp = 0; - - if (rcvd > cfg->xsk->batch_size) { - log_error("errno: %d/\"%s\"\n", errno, strerror(errno)); - exit(EXIT_FAILURE); - } - - for (i = 0; i < rcvd; i++) { - const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); - eop_cnt += 
IS_EOP_DESC(desc->options); - uint64_t addr = desc->addr; - uint32_t len = desc->len; - uint64_t orig = addr; - - addr = xsk_umem__add_offset_to_addr(addr); - uint64_t *pkt = xsk_umem__get_data(cfg->umem->buffer, addr); - - // Put it in the message vector - msg->msg_iov[i].data = pkt; - msg->msg_iov[i].len = len; - msg->msg_iov[i].addr = orig; - msg->msg_iov[i].options = desc->options; - - __hex_dump(pkt, len, addr); - } - msg->msg_len = rcvd; - - xsk_ring_cons__release(&xsk->rx, rcvd); -#ifdef STATS - xsk->ring_stats.rx_npkts += eop_cnt; - xsk->ring_stats.rx_frags += rcvd; -#endif - return rcvd; -} - static inline void __replenish_fill_ring(struct config *cfg, struct socket *xsk, uint32_t num) { uint32_t ret, idx_fq = 0; @@ -536,7 +391,7 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk __complete_tx_completions(cfg, xsk); if ((cfg->smart_poll || cfg->sleep_poll) && cfg->xsk->idle_timeout && xsk->idle_timestamp && rdtsc() > xsk->idle_timestamp) { - ret = flash__oldpoll(xsk, &xsk->idle_fd, 1, -1); + ret = __poll(xsk, &xsk->idle_fd, 1, -1); if (ret <= 0) { xsk->idle_timestamp = rdtsc() + ((get_timer_hz(cfg) / MS_PER_S) * cfg->xsk->idle_timeout); return 0; @@ -598,49 +453,6 @@ size_t flash__recvmsg(struct config *cfg, struct socket *xsk, struct xskvec *xsk return rcvd; } -size_t flash__oldsendmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t nsend) -{ - uint32_t i; - uint32_t frags_done = 0, eop_cnt = 0; - uint32_t nb_frags = 0; - - if (!nsend) - return 0; - - uint32_t idx_tx = __reserve_tx(cfg, xsk, nsend); - - for (i = 0; i < nsend; i++) { - struct xskvec *xv = msgiov[i]; - bool eop = IS_EOP_DESC(xv->options); - uint64_t addr = xv->addr; - - uint32_t len = xv->len; - nb_frags++; - - struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++); - - tx_desc->options = eop ? 
0 : XDP_PKT_CONTD; - tx_desc->options |= (xv->options & 0xFFFF0000); - tx_desc->addr = addr; - tx_desc->len = len; - - __hex_dump(xv->data, xv->len, addr); - - if (eop) { - frags_done += nb_frags; - nb_frags = 0; - eop_cnt++; - } - } - xsk_ring_prod__submit(&xsk->tx, frags_done); - xsk->outstanding_tx += frags_done; -#ifdef STATS - xsk->ring_stats.tx_npkts += eop_cnt; - xsk->ring_stats.tx_frags += nsend; -#endif - return nsend; -} - size_t flash__sendmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t nsend) { bool eop; @@ -710,32 +522,6 @@ void flash__track_tx_and_drop(struct config *cfg, struct socket *xsk, struct xsk *ndrop = wdrop; } -size_t flash__olddropmsg(struct config *cfg, struct socket *xsk, struct xskvec **msgiov, uint32_t ndrop) -{ - uint32_t i; - uint32_t eop_cnt = 0; - - if (!ndrop) - return 0; - - uint32_t idx_fq = __reserve_fq(cfg, xsk, ndrop); - - for (i = 0; i < ndrop; i++) { - struct xskvec *xv = msgiov[i]; - uint64_t addr = xv->addr; - - uint64_t orig = xsk_umem__extract_addr(addr); - eop_cnt += IS_EOP_DESC(xv->options); - *xsk_ring_prod__fill_addr(&xsk->fill, idx_fq++) = orig; - } - - xsk_ring_prod__submit(&xsk->fill, ndrop); -#ifdef STATS - xsk->ring_stats.drop_npkts += ndrop; -#endif - return ndrop; -} - size_t flash__dropmsg(struct config *cfg, struct socket *xsk, struct xskvec *xskvecs, uint32_t ndrop) { uint32_t i, idx_fq; diff --git a/lib/flash/params/flash_params.c b/lib/flash/params/flash_params.c index 9488b8b..a333cab 100644 --- a/lib/flash/params/flash_params.c +++ b/lib/flash/params/flash_params.c @@ -49,9 +49,9 @@ const struct option_wrapper long_options[] = { { { "clock", required_argument, NULL, 'w' }, "Clock NAME (default MONOTONIC) -- not implemented yet", "", false }, - { { "track-outstanding-tx", no_argument, NULL, 'o' }, "Track outstanding Tx for each outgoing edge [default: false]", false }, + { { "track-tx", no_argument, NULL, 'o' }, "Track outstanding Tx for each outgoing edge [default: false]", 
false },
 
-	{ { "max-outstanding-tx", required_argument, NULL, 'O' },
+	{ { "max-tx", required_argument, NULL, 'O' },
 	  "Maximum outstanding Tx packets for this NF (default: 256 (only in powers of 2))",
 	  "",
 	  false },
 

From b12ee189c690c40be7de0444c4254c536ca82db1 Mon Sep 17 00:00:00 2001
From: Debojeet Das
Date: Thu, 30 Oct 2025 23:46:35 +0530
Subject: [PATCH 38/43] feat: updated CI script for rust

---
 tools/ci_build.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/ci_build.sh b/tools/ci_build.sh
index 99be3bd..7da4496 100755
--- a/tools/ci_build.sh
+++ b/tools/ci_build.sh
@@ -9,11 +9,13 @@ if [ $(id -u) -ne 0 ]; then
 fi
 
 $SUDO apt update
-$SUDO apt install -y build-essential meson libbpf-dev pkg-config git gcc-multilib clang llvm lld m4 libpcap-dev libcjson-dev libncurses-dev
+$SUDO apt install -y build-essential meson libbpf-dev pkg-config git gcc-multilib clang llvm lld m4 libpcap-dev libcjson-dev libncurses-dev libnuma-dev
 
 git clone https://github.com/xdp-project/xdp-tools.git
-make -j -C xdp-tools libxdp
-$SUDO make -j -C xdp-tools libxdp_install
+make PREFIX=/usr -j -C xdp-tools libxdp
+$SUDO PREFIX=/usr make -j -C xdp-tools libxdp_install
+
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 
 meson setup build
 meson compile -C build

From d52098bd572ea3f47e396cec5c696351da2ab4e2 Mon Sep 17 00:00:00 2001
From: Arghyadip Chakraborty
Date: Fri, 31 Oct 2025 01:25:14 +0530
Subject: [PATCH 39/43] feat(rust): added pollout, updated polling and
 backpressure

- added pollout feature, updated polling logic
- added backpressure fd, renamed poll_fd to idle
- added kick_rx, updated kick to kick_tx
- added try_kick_rx
- renamed Fd to SocketFd
- updated umem and nf id to usize
- updated default backpressure sense to 1
- updated socket shared config
---
 lib/flash-rs/src/client.rs               | 35 ++++--
 lib/flash-rs/src/config/common.rs        |  8 +-
 lib/flash-rs/src/config/config.rs        |  5 +-
 lib/flash-rs/src/config/config_clap.rs   | 14 ++-
lib/flash-rs/src/config/mod.rs | 4 +- lib/flash-rs/src/config/poll.rs | 34 ++++-- lib/flash-rs/src/config/socket.rs | 27 +++++ lib/flash-rs/src/error.rs | 3 +- lib/flash-rs/src/fd/mod.rs | 6 +- lib/flash-rs/src/fd/{fd.rs => socket.rs} | 58 ++++----- lib/flash-rs/src/mem/mod.rs | 12 +- lib/flash-rs/src/mem/pollout.rs | 70 +++++++++++ lib/flash-rs/src/mem/ring/comp.rs | 4 +- lib/flash-rs/src/mem/ring/fill.rs | 19 +-- lib/flash-rs/src/mem/ring/rx.rs | 9 +- lib/flash-rs/src/mem/ring/tx.rs | 4 +- lib/flash-rs/src/uds/client.rs | 83 ++++++++++--- lib/flash-rs/src/uds/conn.rs | 2 +- lib/flash-rs/src/uds/def.rs | 4 +- lib/flash-rs/src/uds/error.rs | 12 ++ lib/flash-rs/src/util.rs | 8 +- lib/flash-rs/src/xsk/mod.rs | 3 - lib/flash-rs/src/xsk/shared.rs | 25 ---- lib/flash-rs/src/xsk/socket.rs | 147 +++++++++++++---------- 24 files changed, 399 insertions(+), 197 deletions(-) create mode 100644 lib/flash-rs/src/config/socket.rs rename lib/flash-rs/src/fd/{fd.rs => socket.rs} (70%) create mode 100644 lib/flash-rs/src/mem/pollout.rs delete mode 100644 lib/flash-rs/src/xsk/shared.rs diff --git a/lib/flash-rs/src/client.rs b/lib/flash-rs/src/client.rs index 2bbf1e7..183cd23 100644 --- a/lib/flash-rs/src/client.rs +++ b/lib/flash-rs/src/client.rs @@ -1,12 +1,12 @@ use std::{net::Ipv4Addr, str::FromStr, sync::Arc}; use crate::{ - config::{BindFlags, FlashConfig, Mode, PollConfig, XskConfig}, + config::{BindFlags, FlashConfig, Mode, PollConfig, SocketConfig, XskConfig}, error::FlashResult, - fd::Fd, - mem::Umem, + fd::SocketFd, + mem::{PollOutStatus, Umem}, uds::UdsClient, - xsk::{Socket, SocketShared}, + xsk::Socket, }; #[cfg(feature = "stats")] @@ -79,10 +79,10 @@ pub fn connect(config: &FlashConfig) -> FlashResult<(Vec, Route)> { ); #[cfg(feature = "stats")] - socket_info.push((Fd::new(fd), ifqueue)); + socket_info.push((SocketFd::new(fd), ifqueue)); #[cfg(not(feature = "stats"))] - socket_info.push(Fd::new(fd)); + socket_info.push(SocketFd::new(fd)); } #[cfg(feature = "stats")] 
@@ -91,11 +91,6 @@ pub fn connect(config: &FlashConfig) -> FlashResult<(Vec, Route)> { #[cfg(all(feature = "stats", feature = "tracing"))] tracing::debug!("Ifname: {ifname}"); - // let route_size = uds_client.get_route_info()?; - - // #[cfg(feature = "tracing")] - // tracing::debug!("Route Size: {route_size}"); - let route = Route { ip_addr: Ipv4Addr::from_str(&uds_client.get_ip_addr()?)?, next: uds_client @@ -108,8 +103,12 @@ pub fn connect(config: &FlashConfig) -> FlashResult<(Vec, Route)> { uds_client.set_nonblocking()?; let xsk_config = XskConfig::new(bind_flags, mode); + let next_size = uds_client.get_route_info()?; + let poll_config = PollConfig::new( config.smart_poll, + config.sleep_poll, + next_size != 0, config.idle_timeout, config.idleness, config.bp_timeout, @@ -117,7 +116,17 @@ pub fn connect(config: &FlashConfig) -> FlashResult<(Vec, Route)> { xsk_config.batch_size, )?; - let socket_shared = Arc::new(SocketShared::new(xsk_config, poll_config, uds_client)); + let (pollout_fd, pollout_size) = uds_client.get_pollout_status()?; + let prev_nf = uds_client.get_prev_nf()?; + + let pollout_status = PollOutStatus::new(pollout_fd, pollout_size, config.nf_id, prev_nf)?; + + let socket_config = Arc::new(SocketConfig::new( + xsk_config, + poll_config, + pollout_status, + uds_client, + )); let sockets = socket_info .into_iter() @@ -136,7 +145,7 @@ pub fn connect(config: &FlashConfig) -> FlashResult<(Vec, Route)> { umem_offset, #[cfg(feature = "stats")] Stats::new(fd, ifname.clone(), ifqueue, xdp_flags.clone()), - socket_shared.clone(), + socket_config.clone(), ) }) .collect::, _>>()?; diff --git a/lib/flash-rs/src/config/common.rs b/lib/flash-rs/src/config/common.rs index 8ddd391..cb9937b 100644 --- a/lib/flash-rs/src/config/common.rs +++ b/lib/flash-rs/src/config/common.rs @@ -3,11 +3,12 @@ use std::time::Duration; use super::FlashConfig; impl FlashConfig { - #[allow(clippy::must_use_candidate)] + #[allow(clippy::must_use_candidate, clippy::too_many_arguments)] 
pub fn new(
-        umem_id: u16,
-        nf_id: u16,
+        umem_id: usize,
+        nf_id: usize,
         smart_poll: bool,
+        sleep_poll: bool,
         idle_timeout: Duration,
         idleness: f32,
         bp_timeout: Duration,
@@ -17,6 +18,7 @@
             umem_id,
             nf_id,
             smart_poll,
+            sleep_poll,
             idle_timeout,
             idleness,
             bp_timeout,
diff --git a/lib/flash-rs/src/config/config.rs b/lib/flash-rs/src/config/config.rs
index 1ab9cff..6eff0f6 100644
--- a/lib/flash-rs/src/config/config.rs
+++ b/lib/flash-rs/src/config/config.rs
@@ -2,9 +2,10 @@ use std::time::Duration;
 
 #[derive(Debug)]
 pub struct FlashConfig {
-    pub(crate) umem_id: u16,
-    pub(crate) nf_id: u16,
+    pub(crate) umem_id: usize,
+    pub(crate) nf_id: usize,
     pub(crate) smart_poll: bool,
+    pub(crate) sleep_poll: bool,
     pub(crate) idle_timeout: Duration,
diff --git a/lib/flash-rs/src/config/config_clap.rs b/lib/flash-rs/src/config/config_clap.rs
index 985ef14..eed8ff7 100644
--- a/lib/flash-rs/src/config/config_clap.rs
+++ b/lib/flash-rs/src/config/config_clap.rs
@@ -5,10 +5,10 @@ use clap::Parser;
 #[derive(Debug, Parser)]
 pub struct FlashConfig {
     #[arg(short, long, help = "Umem id used to connect to monitor")]
-    pub(crate) umem_id: u16,
+    pub(crate) umem_id: usize,
 
     #[arg(short = 'f', long, help = "NF id used to connect to monitor")]
-    pub(crate) nf_id: u16,
+    pub(crate) nf_id: usize,
 
     #[arg(
         short = 'p',
@@ -18,6 +18,14 @@
     )]
     pub(crate) smart_poll: bool,
 
+    #[arg(
+        short = 'P',
+        long,
+        default_value_t = false,
+        help = "Enable periodic sleep mode"
+    )]
+    pub(crate) sleep_poll: bool,
+
     #[arg(
         short,
         long,
@@ -46,7 +54,7 @@
     #[arg(
         short = 'B',
         long,
-        default_value_t = 0.5,
+        default_value_t = 1.0,
         help = "Backpressure sensitivity [0.0 = low (0 pkts), 1.0 = high (2048 pkts)]"
     )]
     pub(crate) bp_sense: f32,
diff --git a/lib/flash-rs/src/config/mod.rs b/lib/flash-rs/src/config/mod.rs
index f76e30b..30de253 100644
--- a/lib/flash-rs/src/config/mod.rs
+++ 
b/lib/flash-rs/src/config/mod.rs @@ -1,6 +1,7 @@ mod common; mod error; mod poll; +mod socket; mod xsk; #[cfg_attr(feature = "clap", path = "config_clap.rs")] @@ -8,7 +9,8 @@ mod xsk; mod config; pub(crate) use { - poll::PollConfig, + poll::{PollConfig, PollMode}, + socket::SocketConfig, xsk::{BindFlags, Mode, XskConfig}, }; diff --git a/lib/flash-rs/src/config/poll.rs b/lib/flash-rs/src/config/poll.rs index 2bb5702..7f34e1e 100644 --- a/lib/flash-rs/src/config/poll.rs +++ b/lib/flash-rs/src/config/poll.rs @@ -4,8 +4,17 @@ use libxdp_sys::XSK_RING_PROD__DEFAULT_NUM_DESCS; use super::error::{ConfigError, ConfigResult}; +#[derive(Debug, PartialEq)] +pub(crate) enum PollMode { + Smart, + Sleep, + None, +} + #[derive(Debug)] pub(crate) struct PollConfig { + pub(crate) mode: PollMode, + pub(crate) next_not_empty: bool, pub(crate) idle_timeout: Duration, pub(crate) idle_threshold: u32, pub(crate) bp_timeout: Duration, @@ -16,32 +25,41 @@ impl PollConfig { #[allow( clippy::cast_possible_truncation, clippy::cast_precision_loss, - clippy::cast_sign_loss + clippy::cast_sign_loss, + clippy::too_many_arguments )] pub(crate) fn new( smart_poll: bool, + sleep_poll: bool, + next_not_empty: bool, idle_timeout: Duration, idleness: f32, bp_timeout: Duration, bp_sense: f32, batch_size: u32, - ) -> ConfigResult> { - if !smart_poll { - return Ok(None); - } - + ) -> ConfigResult { if idle_timeout == Duration::ZERO { return Err(ConfigError::InvalidIdleTimeout); } + let mode = if smart_poll { + PollMode::Smart + } else if sleep_poll { + PollMode::Sleep + } else { + PollMode::None + }; + let idle_threshold = (batch_size as f32 * idleness) as u32; let bp_threshold = (XSK_RING_PROD__DEFAULT_NUM_DESCS as f32 * bp_sense) as u32; - Ok(Some(Self { + Ok(Self { + mode, + next_not_empty, idle_timeout, idle_threshold, bp_timeout, bp_threshold, - })) + }) } } diff --git a/lib/flash-rs/src/config/socket.rs b/lib/flash-rs/src/config/socket.rs new file mode 100644 index 0000000..e65889f --- /dev/null 
+++ b/lib/flash-rs/src/config/socket.rs @@ -0,0 +1,27 @@ +use crate::{mem::PollOutStatus, uds::UdsClient}; + +use super::{poll::PollConfig, xsk::XskConfig}; + +#[derive(Debug)] +pub(crate) struct SocketConfig { + pub(crate) xsk: XskConfig, + pub(crate) poll: PollConfig, + pub(crate) pollout_status: PollOutStatus, + _uds_client: UdsClient, +} + +impl SocketConfig { + pub(crate) fn new( + xsk: XskConfig, + poll: PollConfig, + pollout_status: PollOutStatus, + uds_client: UdsClient, + ) -> Self { + Self { + xsk, + poll, + pollout_status, + _uds_client: uds_client, + } + } +} diff --git a/lib/flash-rs/src/error.rs b/lib/flash-rs/src/error.rs index 9d4b926..a5b67cc 100644 --- a/lib/flash-rs/src/error.rs +++ b/lib/flash-rs/src/error.rs @@ -1,6 +1,6 @@ use std::{io, net::AddrParseError}; -use crate::{config::ConfigError, fd::FdError, uds::UdsError, xsk::SocketError}; +use crate::{config::ConfigError, fd::FdError, mem::MemError, uds::UdsError, xsk::SocketError}; pub(crate) type FlashResult = Result; @@ -12,6 +12,7 @@ pub enum FlashError { Config(#[from] ConfigError), Fd(#[from] FdError), + Mem(#[from] MemError), Socket(#[from] SocketError), UDS(#[from] UdsError), } diff --git a/lib/flash-rs/src/fd/mod.rs b/lib/flash-rs/src/fd/mod.rs index d2b3046..cf9e9d7 100644 --- a/lib/flash-rs/src/fd/mod.rs +++ b/lib/flash-rs/src/fd/mod.rs @@ -1,9 +1,7 @@ mod error; +mod socket; mod xdp; -#[allow(clippy::module_inception)] -mod fd; - -pub(crate) use fd::Fd; +pub(crate) use socket::SocketFd; pub use error::FdError; diff --git a/lib/flash-rs/src/fd/fd.rs b/lib/flash-rs/src/fd/socket.rs similarity index 70% rename from lib/flash-rs/src/fd/fd.rs rename to lib/flash-rs/src/fd/socket.rs index 47d118f..acd5ca7 100644 --- a/lib/flash-rs/src/fd/fd.rs +++ b/lib/flash-rs/src/fd/socket.rs @@ -1,16 +1,11 @@ use std::{fmt, io, ptr}; -use libc::{ - EAGAIN, EBUSY, ENETDOWN, ENOBUFS, MSG_DONTWAIT, SOL_XDP, XDP_MMAP_OFFSETS, pollfd, ssize_t, -}; +use libc::{MSG_DONTWAIT, MSG_MORE, SOL_XDP, 
XDP_MMAP_OFFSETS, pollfd}; #[cfg(feature = "stats")] use libc::XDP_STATISTICS; -use crate::{ - mem::{MemError, Mmap}, - util, -}; +use crate::mem::{MemError, Mmap}; use super::{ error::{FdError, FdResult}, @@ -21,31 +16,35 @@ use super::{ use super::xdp::{XDP_STATISTICS_SIZEOF, XdpStatistics}; #[derive(Clone)] -pub(crate) struct Fd { +pub(crate) struct SocketFd { id: i32, - poll_fd: pollfd, - // poll_timeout: i32, + idle: pollfd, + backpressure: pollfd, } #[allow(clippy::missing_fields_in_debug)] -impl fmt::Debug for Fd { +impl fmt::Debug for SocketFd { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Fd").field("id", &self.id).finish() } } -impl Fd { +impl SocketFd { pub(crate) fn new(id: i32) -> Self { assert!(id >= 0, "fd error: invalid file descriptor: {id}"); - Fd { + Self { id, - poll_fd: pollfd { + idle: pollfd { fd: id, events: libc::POLLIN, revents: 0, }, - // poll_timeout, + backpressure: pollfd { + fd: id, + events: libc::POLLOUT, + revents: 0, + }, } } @@ -55,17 +54,13 @@ impl Fd { } #[inline] - pub(crate) fn kick(&self) -> Result { - let n = unsafe { libc::sendto(self.id, ptr::null(), 0, MSG_DONTWAIT, ptr::null(), 0) }; + pub(crate) fn kick_tx(&self) { + unsafe { libc::sendto(self.id, ptr::null(), 0, MSG_DONTWAIT, ptr::null(), 0) }; + } - if n >= 0 { - Ok(n) - } else { - match util::get_errno() { - ENOBUFS | EAGAIN | EBUSY | ENETDOWN => Ok(0), - _ => Err(()), - } - } + #[inline] + pub(crate) fn kick_rx(&self) { + unsafe { libc::sendto(self.id, ptr::null(), 0, MSG_MORE, ptr::null(), 0) }; } #[inline] @@ -83,8 +78,17 @@ impl Fd { } #[inline] - pub(crate) fn poll(&mut self) -> io::Result { - match unsafe { libc::poll(&raw mut self.poll_fd, 1, -1) } { + pub(crate) fn poll_idle(&mut self) -> io::Result { + match unsafe { libc::poll(&raw mut self.idle, 1, -1) } { + -1 => Err(io::Error::last_os_error()), + 0 => Ok(false), + _ => Ok(true), + } + } + + #[inline] + pub(crate) fn poll_backpressure(&mut self) -> io::Result { + match 
unsafe { libc::poll(&raw mut self.backpressure, 1, 1000) } { -1 => Err(io::Error::last_os_error()), 0 => Ok(false), _ => Ok(true), diff --git a/lib/flash-rs/src/mem/mod.rs b/lib/flash-rs/src/mem/mod.rs index f6e16c4..d70ce50 100644 --- a/lib/flash-rs/src/mem/mod.rs +++ b/lib/flash-rs/src/mem/mod.rs @@ -1,16 +1,20 @@ mod desc; mod error; mod mmap; +mod pollout; mod ring; mod umem; #[cfg(feature = "pool")] mod pool; -pub(crate) use desc::Desc; -pub(crate) use mmap::Mmap; -pub(crate) use ring::{CompRing, Cons, FillRing, Prod, RxRing, TxRing}; -pub(crate) use umem::Umem; +pub(crate) use { + desc::Desc, + mmap::Mmap, + pollout::PollOutStatus, + ring::{CompRing, Cons, FillRing, Prod, RxRing, TxRing}, + umem::Umem, +}; #[cfg(feature = "pool")] pub(crate) use pool::Pool; diff --git a/lib/flash-rs/src/mem/pollout.rs b/lib/flash-rs/src/mem/pollout.rs new file mode 100644 index 0000000..8933ea9 --- /dev/null +++ b/lib/flash-rs/src/mem/pollout.rs @@ -0,0 +1,70 @@ +use std::{ + ptr::{self, NonNull}, + slice, +}; + +use libc::c_void; + +use super::error::{MemError, MemResult}; + +#[derive(Debug)] +pub(crate) struct PollOutStatus { + addr: NonNull, + size: usize, + status: NonNull, + nf_id: usize, + prev_nf: Vec, +} + +unsafe impl Send for PollOutStatus {} +unsafe impl Sync for PollOutStatus {} + +impl PollOutStatus { + pub(crate) fn new(fd: i32, size: usize, nf_id: usize, prev_nf: Vec) -> MemResult { + let addr = unsafe { + libc::mmap( + ptr::null_mut(), + size, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_SHARED, + fd, + 0, + ) + }; + + let Some(addr) = NonNull::new(addr) else { + return Err(MemError::last_os_error()); + }; + + let status = unsafe { NonNull::new_unchecked(addr.as_ptr().cast::()) }; + + Ok(Self { + addr, + size, + status, + nf_id, + prev_nf, + }) + } + + #[inline] + pub(crate) fn set(&self, bool: bool) { + unsafe { + *self.status.as_ptr().add(self.nf_id) = u8::from(bool); + } + } + + #[inline] + pub(crate) fn any(&self) -> bool { + let status = unsafe { 
slice::from_raw_parts(self.status.as_ptr(), self.size) }; + self.prev_nf.iter().any(|&prev_id| status[prev_id] != 0) + } +} + +impl Drop for PollOutStatus { + fn drop(&mut self) { + unsafe { + libc::munmap(self.addr.as_ptr(), self.size); + } + } +} diff --git a/lib/flash-rs/src/mem/ring/comp.rs b/lib/flash-rs/src/mem/ring/comp.rs index bd18152..93c094e 100644 --- a/lib/flash-rs/src/mem/ring/comp.rs +++ b/lib/flash-rs/src/mem/ring/comp.rs @@ -6,7 +6,7 @@ use libxdp_sys::{ xsk_ring_cons__release, }; -use crate::{fd::Fd, mem::Mmap}; +use crate::{fd::SocketFd, mem::Mmap}; use super::{Cons, error::RingResult}; @@ -20,7 +20,7 @@ unsafe impl Send for CompRing {} impl CompRing { #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] - pub(crate) fn new(fd: &Fd, off: &xdp_ring_offset, scale: u32) -> RingResult { + pub(crate) fn new(fd: &SocketFd, off: &xdp_ring_offset, scale: u32) -> RingResult { let comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS * scale; let mmap = fd.mmap( diff --git a/lib/flash-rs/src/mem/ring/fill.rs b/lib/flash-rs/src/mem/ring/fill.rs index d9c8ec0..d383c59 100644 --- a/lib/flash-rs/src/mem/ring/fill.rs +++ b/lib/flash-rs/src/mem/ring/fill.rs @@ -7,7 +7,7 @@ use libxdp_sys::{ }; use crate::{ - fd::Fd, + fd::SocketFd, mem::{FRAME_SIZE, Mmap}, }; @@ -23,7 +23,7 @@ unsafe impl Send for FillRing {} impl FillRing { #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] - pub(crate) fn new(fd: &Fd, off: &xdp_ring_offset, scale: u32) -> RingResult { + pub(crate) fn new(fd: &SocketFd, off: &xdp_ring_offset, scale: u32) -> RingResult { let fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2 * scale; let mmap = fd.mmap( @@ -48,11 +48,6 @@ impl FillRing { }) } - #[inline] - pub(crate) fn addr(&mut self, idx: u32) -> Option<&mut u64> { - unsafe { xsk_ring_prod__fill_addr(&raw mut self.ring, idx).as_mut() } - } - pub(crate) fn populate(&mut self, scale: u32, offset: u64) -> RingResult<()> { let frame_size = u64::from(FRAME_SIZE); let 
nr_frames = XSK_RING_PROD__DEFAULT_NUM_DESCS * scale; @@ -75,6 +70,16 @@ impl FillRing { self.submit(nr_frames); Ok(()) } + + #[inline] + pub(crate) fn addr(&mut self, idx: u32) -> Option<&mut u64> { + unsafe { xsk_ring_prod__fill_addr(&raw mut self.ring, idx).as_mut() } + } + + #[inline] + pub(crate) fn is_half_full(&mut self) -> bool { + self.ring.cached_cons - self.ring.cached_prod <= self.ring.size / 2 + } } impl Prod for FillRing { diff --git a/lib/flash-rs/src/mem/ring/rx.rs b/lib/flash-rs/src/mem/ring/rx.rs index 6096950..967a914 100644 --- a/lib/flash-rs/src/mem/ring/rx.rs +++ b/lib/flash-rs/src/mem/ring/rx.rs @@ -6,7 +6,7 @@ use libxdp_sys::{ xsk_ring_cons__release, xsk_ring_cons__rx_desc, }; -use crate::{fd::Fd, mem::Mmap}; +use crate::{fd::SocketFd, mem::Mmap}; use super::{Cons, error::RingResult}; @@ -20,7 +20,7 @@ unsafe impl Send for RxRing {} impl RxRing { #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] - pub(crate) fn new(fd: &Fd, off: &xdp_ring_offset, scale: u32) -> RingResult { + pub(crate) fn new(fd: &SocketFd, off: &xdp_ring_offset, scale: u32) -> RingResult { let rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS * scale; let mmap = fd.mmap( @@ -49,6 +49,11 @@ impl RxRing { pub(crate) fn desc(&self, idx: u32) -> Option<&xdp_desc> { unsafe { xsk_ring_cons__rx_desc(&raw const self.ring, idx).as_ref() } } + + #[inline] + pub(crate) fn is_half_empty(&mut self) -> bool { + self.ring.cached_prod - self.ring.cached_cons < self.ring.size / 2 + } } impl Cons for RxRing { diff --git a/lib/flash-rs/src/mem/ring/tx.rs b/lib/flash-rs/src/mem/ring/tx.rs index bc2c37c..f111e93 100644 --- a/lib/flash-rs/src/mem/ring/tx.rs +++ b/lib/flash-rs/src/mem/ring/tx.rs @@ -6,7 +6,7 @@ use libxdp_sys::{ xsk_ring_prod__reserve, xsk_ring_prod__submit, xsk_ring_prod__tx_desc, }; -use crate::{fd::Fd, mem::Mmap}; +use crate::{fd::SocketFd, mem::Mmap}; use super::{Prod, error::RingResult}; @@ -20,7 +20,7 @@ unsafe impl Send for TxRing {} impl TxRing { 
#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] - pub(crate) fn new(fd: &Fd, off: &xdp_ring_offset, scale: u32) -> RingResult { + pub(crate) fn new(fd: &SocketFd, off: &xdp_ring_offset, scale: u32) -> RingResult { let tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * scale; let mmap = fd.mmap( diff --git a/lib/flash-rs/src/uds/client.rs b/lib/flash-rs/src/uds/client.rs index 3e755fb..b0cb728 100644 --- a/lib/flash-rs/src/uds/client.rs +++ b/lib/flash-rs/src/uds/client.rs @@ -1,14 +1,14 @@ -use crate::{uds::error::UdsError, util}; +use crate::util; use super::{ conn::UdsConn, def::{ FLASH_CLOSE_CONN, FLASH_CREATE_SOCKET, FLASH_GET_BIND_FLAGS, FLASH_GET_DST_IP_ADDR, FLASH_GET_FRAGS_ENABLED, FLASH_GET_IFNAME, FLASH_GET_IP_ADDR, FLASH_GET_MODE, - FLASH_GET_POLL_TIMEOUT, FLASH_GET_ROUTE_INFO, FLASH_GET_UMEM, FLASH_GET_UMEM_OFFSET, - FLASH_GET_XDP_FLAGS, + FLASH_GET_POLLOUT_STATUS, FLASH_GET_PREV_NF, FLASH_GET_ROUTE_INFO, FLASH_GET_UMEM, + FLASH_GET_UMEM_OFFSET, FLASH_GET_XDP_FLAGS, }, - error::UdsResult, + error::{UdsError, UdsResult}, }; const FLASH_UNIX_SOCKET_PATH: &str = "/tmp/flash/uds.sock"; @@ -26,11 +26,15 @@ impl UdsClient { }) } - #[allow(clippy::similar_names)] + #[allow( + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::similar_names + )] pub(crate) fn get_umem( &mut self, - umem_id: u16, - nf_id: u16, + umem_id: usize, + nf_id: usize, ) -> UdsResult<(i32, usize, usize, u32)> { #[repr(C)] struct NfData { @@ -40,8 +44,8 @@ impl UdsClient { self.conn.write_all(&FLASH_GET_UMEM)?; self.conn.write_all(util::as_bytes(&NfData { - umem_id: i32::from(umem_id), - nf_id: i32::from(nf_id), + umem_id: umem_id as i32, + nf_id: nf_id as i32, }))?; let umem_fd = self.conn.recv_fd()?; @@ -124,10 +128,10 @@ impl UdsClient { Ok(self.conn.recv_u32()?) } - pub(crate) fn get_poll_timeout(&mut self) -> UdsResult { - self.conn.write_all(&FLASH_GET_POLL_TIMEOUT)?; - Ok(self.conn.recv_i32()?) 
- } + // pub(crate) fn get_poll_timeout(&mut self) -> UdsResult { + // self.conn.write_all(&FLASH_GET_POLL_TIMEOUT)?; + // Ok(self.conn.recv_i32()?) + // } pub(crate) fn get_frags_enabled(&mut self) -> UdsResult { self.conn.write_all(&FLASH_GET_FRAGS_ENABLED)?; @@ -146,15 +150,56 @@ impl UdsClient { pub(crate) fn get_dst_ip_addr(&mut self) -> UdsResult> { self.conn.write_all(&FLASH_GET_DST_IP_ADDR)?; - let dst_size = self.conn.recv_i32()?; + let dst_size = self.conn.recv_i32()?; if dst_size < 0 { - Err(UdsError::InvalidNextSize) - } else { - Ok((0..dst_size) - .map(|_| self.conn.recv_string::<16>()) - .collect::, _>>()?) + return Err(UdsError::InvalidNextSize); + } + + let mut dst_ip_addr = Vec::with_capacity(dst_size as usize); + for _ in 0..dst_size { + dst_ip_addr.push(self.conn.recv_string::<16>()?); + } + + Ok(dst_ip_addr) + } + + pub(crate) fn get_pollout_status(&mut self) -> UdsResult<(i32, usize)> { + self.conn.write_all(&FLASH_GET_POLLOUT_STATUS)?; + + let pollout_fd = self.conn.recv_fd()?; + if pollout_fd < 0 { + return Err(UdsError::InvalidPollOutFd); } + + let pollout_size = self.conn.recv_i32()?; + if pollout_size < 0 { + return Err(UdsError::InvalidPollOutSize); + } + + Ok((pollout_fd, pollout_size as usize)) + } + + #[allow(clippy::cast_possible_truncation)] + pub(crate) fn get_prev_nf(&mut self) -> UdsResult> { + self.conn.write_all(&FLASH_GET_PREV_NF)?; + + let prev_size = self.conn.recv_i32()?; + if prev_size < 0 { + return Err(UdsError::InvalidPrevSize); + } + + let mut prev_nf_ids = Vec::with_capacity(prev_size as usize); + for _ in 0..prev_size { + let prev_nf = self.conn.recv_i32()?; + if prev_nf < 0 { + return Err(UdsError::InvalidPrevNfId); + } + + prev_nf_ids.push(prev_nf as usize); + } + + Ok(prev_nf_ids) } pub(crate) fn set_nonblocking(&mut self) -> UdsResult<()> { diff --git a/lib/flash-rs/src/uds/conn.rs b/lib/flash-rs/src/uds/conn.rs index 9a580a3..888e745 100644 --- a/lib/flash-rs/src/uds/conn.rs +++ b/lib/flash-rs/src/uds/conn.rs @@ 
-1,5 +1,5 @@ use std::{ - io::{self, Read, Write as _}, + io::{self, Read as _, Write as _}, os::unix::net::UnixStream, path::Path, }; diff --git a/lib/flash-rs/src/uds/def.rs b/lib/flash-rs/src/uds/def.rs index 172d1a5..036fae3 100644 --- a/lib/flash-rs/src/uds/def.rs +++ b/lib/flash-rs/src/uds/def.rs @@ -14,8 +14,10 @@ flash_command!(FLASH_GET_ROUTE_INFO, 7); flash_command!(FLASH_GET_BIND_FLAGS, 8); flash_command!(FLASH_GET_XDP_FLAGS, 9); flash_command!(FLASH_GET_MODE, 10); -flash_command!(FLASH_GET_POLL_TIMEOUT, 11); +// flash_command!(FLASH_GET_POLL_TIMEOUT, 11); flash_command!(FLASH_GET_FRAGS_ENABLED, 12); flash_command!(FLASH_GET_IFNAME, 13); flash_command!(FLASH_GET_IP_ADDR, 14); flash_command!(FLASH_GET_DST_IP_ADDR, 15); +flash_command!(FLASH_GET_POLLOUT_STATUS, 16); +flash_command!(FLASH_GET_PREV_NF, 17); diff --git a/lib/flash-rs/src/uds/error.rs b/lib/flash-rs/src/uds/error.rs index 5c5bd7a..6a44591 100644 --- a/lib/flash-rs/src/uds/error.rs +++ b/lib/flash-rs/src/uds/error.rs @@ -39,4 +39,16 @@ pub enum UdsError { #[error("uds error: invalid xdp flags")] InvalidXdpFlags, + + #[error("uds error: invalid pollout fd")] + InvalidPollOutFd, + + #[error("uds error: invalid pollout size")] + InvalidPollOutSize, + + #[error("uds error: invalid prev size")] + InvalidPrevSize, + + #[error("uds error: invalid prev nf id")] + InvalidPrevNfId, } diff --git a/lib/flash-rs/src/util.rs b/lib/flash-rs/src/util.rs index 9481766..239a444 100644 --- a/lib/flash-rs/src/util.rs +++ b/lib/flash-rs/src/util.rs @@ -1,9 +1,9 @@ use std::{mem, ptr, slice}; -#[inline] -pub(crate) fn get_errno() -> i32 { - unsafe { *libc::__errno_location() } -} +// #[inline] +// pub(crate) fn get_errno() -> i32 { +// unsafe { *libc::__errno_location() } +// } #[inline] pub(crate) fn as_bytes(data: &T) -> &[u8] { diff --git a/lib/flash-rs/src/xsk/mod.rs b/lib/flash-rs/src/xsk/mod.rs index b22fee0..61e11d8 100644 --- a/lib/flash-rs/src/xsk/mod.rs +++ b/lib/flash-rs/src/xsk/mod.rs @@ -1,7 +1,4 @@ mod 
error; -mod shared; mod socket; -pub(crate) use shared::SocketShared; - pub use {error::SocketError, socket::Socket}; diff --git a/lib/flash-rs/src/xsk/shared.rs b/lib/flash-rs/src/xsk/shared.rs deleted file mode 100644 index c9fb85d..0000000 --- a/lib/flash-rs/src/xsk/shared.rs +++ /dev/null @@ -1,25 +0,0 @@ -use crate::{ - config::{PollConfig, XskConfig}, - uds::UdsClient, -}; - -#[derive(Debug)] -pub(crate) struct SocketShared { - pub(super) xsk_config: XskConfig, - pub(super) poll_config: Option, - pub(super) _uds_client: UdsClient, -} - -impl SocketShared { - pub(crate) fn new( - xsk_config: XskConfig, - poll_config: Option, - uds_client: UdsClient, - ) -> Self { - Self { - xsk_config, - poll_config, - _uds_client: uds_client, - } - } -} diff --git a/lib/flash-rs/src/xsk/socket.rs b/lib/flash-rs/src/xsk/socket.rs index 8da1b48..0034321 100644 --- a/lib/flash-rs/src/xsk/socket.rs +++ b/lib/flash-rs/src/xsk/socket.rs @@ -3,8 +3,8 @@ use std::{sync::Arc, thread}; use quanta::{Clock, Instant}; use crate::{ - config::{BindFlags, Mode}, - fd::Fd, + config::{BindFlags, Mode, PollMode, SocketConfig}, + fd::SocketFd, mem::{CompRing, Cons as _, Desc, FillRing, Prod as _, RxRing, TxRing, Umem}, }; @@ -14,14 +14,11 @@ use crate::mem::Pool; #[cfg(feature = "stats")] use crate::stats::Stats; -use super::{ - error::{SocketError, SocketResult}, - shared::SocketShared, -}; +use super::error::{SocketError, SocketResult}; #[derive(Debug)] pub struct Socket { - fd: Fd, + fd: SocketFd, umem: Umem, fill: FillRing, comp: CompRing, @@ -34,7 +31,7 @@ pub struct Socket { outstanding_tx: u32, clock: Clock, idle_timestamp: Option, - shared: Arc, + config: Arc, #[cfg(feature = "stats")] stats: Arc, @@ -42,13 +39,13 @@ pub struct Socket { impl Socket { pub(crate) fn new( - fd: Fd, + fd: SocketFd, umem: Umem, idx: usize, umem_scale: u32, umem_offset: u64, #[cfg(feature = "stats")] stats: Stats, - data: Arc, + config: Arc, ) -> SocketResult { let off = fd.xdp_mmap_offsets()?; @@ -57,11 
+54,10 @@ impl Socket { let rx = RxRing::new(&fd, off.rx(), umem_scale)?; let tx = TxRing::new(&fd, off.tx(), umem_scale)?; - #[cfg(feature = "pool")] - fill.populate(umem_scale, idx as u64 + umem_offset)?; - #[cfg(not(feature = "pool"))] - fill.populate(2 * umem_scale, idx as u64 + umem_offset)?; + let umem_scale = 2 * umem_scale; + + fill.populate(umem_scale, idx as u64 + umem_offset)?; Ok(Self { fd, @@ -75,11 +71,9 @@ impl Socket { pool: Pool::new(umem_scale, idx as u64 + umem_offset), outstanding_tx: 0, - clock: Clock::new(), idle_timestamp: None, - - shared: data, + config, #[cfg(feature = "stats")] stats: Arc::new(stats), @@ -89,17 +83,28 @@ impl Socket { #[allow(clippy::missing_errors_doc)] #[inline] pub fn poll(&mut self) -> SocketResult { - if self.shared.xsk_config.mode.contains(Mode::FLASH_POLL) { + if self.config.xsk.mode.contains(Mode::FLASH_POLL) { #[cfg(feature = "stats")] unsafe { (*self.stats.app.get()).opt_polls += 1; } - Ok(self.fd.poll()?) + Ok(self.fd.poll_idle()?) } else { Ok(true) } } + #[inline] + fn try_kick_rx(&mut self) { + if self.config.poll.mode == PollMode::Smart + && self.config.pollout_status.any() + && self.fill.is_half_full() + && self.rx.is_half_empty() + { + self.fd.kick_rx(); + } + } + #[allow(clippy::similar_names)] #[inline] fn complete_tx_rx(&mut self) { @@ -107,20 +112,15 @@ impl Socket { return; } - if self - .shared - .xsk_config - .bind_flags - .contains(BindFlags::XDP_COPY) - { + if self.config.xsk.bind_flags.contains(BindFlags::XDP_COPY) { #[cfg(feature = "stats")] unsafe { (*self.stats.app.get()).tx_copy_sendtos += 1; } - let _ = self.fd.kick(); + self.fd.kick_tx(); } - let num_outstanding = self.outstanding_tx.min(self.shared.xsk_config.batch_size); + let num_outstanding = self.outstanding_tx.min(self.config.xsk.batch_size); let mut idx_cq = 0; let completed = self.comp.peek(num_outstanding, &mut idx_cq); @@ -140,8 +140,7 @@ impl Socket { { let mut idx_fq = 0; while self.fill.reserve(completed, &mut idx_fq) != 
completed { - if self.shared.xsk_config.mode.contains(Mode::FLASH_BUSY_POLL) - || self.fill.needs_wakeup() + if self.config.xsk.mode.contains(Mode::FLASH_BUSY_POLL) || self.fill.needs_wakeup() { #[cfg(feature = "stats")] unsafe { @@ -163,6 +162,7 @@ impl Socket { } self.fill.submit(completed); + self.try_kick_rx(); } self.comp.release(completed); @@ -172,11 +172,8 @@ impl Socket { #[inline] fn reserve_fq(&mut self, num: u32) -> u32 { let mut idx_fq = 0; - while self.fill.reserve(num, &mut idx_fq) != num { - if self.shared.xsk_config.mode.contains(Mode::FLASH_BUSY_POLL) - || self.fill.needs_wakeup() - { + if self.config.xsk.mode.contains(Mode::FLASH_BUSY_POLL) || self.fill.needs_wakeup() { #[cfg(feature = "stats")] unsafe { (*self.stats.app.get()).fill_fail_polls += 1; @@ -190,39 +187,61 @@ impl Socket { #[inline] fn reserve_tx(&mut self, num: u32) -> u32 { - let mut idx_tx = 0; - if self.tx.reserve(num, &mut idx_tx) == num { - return idx_tx; + if self.config.xsk.mode.contains(Mode::FLASH_BUSY_POLL) + && self.outstanding_tx > self.config.poll.bp_threshold / 2 + { + #[cfg(feature = "stats")] + unsafe { + (*self.stats.app.get()).tx_wakeup_sendtos += 1; + } + self.complete_tx_rx(); + self.fd.kick_tx(); } - loop { - self.complete_tx_rx(); + if self.config.poll.mode != PollMode::None && self.config.poll.next_not_empty { + while self.outstanding_tx + num > self.config.poll.bp_threshold { + self.complete_tx_rx(); - if self.shared.xsk_config.mode.contains(Mode::FLASH_BUSY_POLL) || self.tx.needs_wakeup() - { + if self.config.xsk.mode.contains(Mode::FLASH_BUSY_POLL) || self.tx.needs_wakeup() { + #[cfg(feature = "stats")] + unsafe { + (*self.stats.app.get()).tx_wakeup_sendtos += 1; + } + self.fd.kick_tx(); + } + + match self.config.poll.mode { + PollMode::Smart => { + self.config.pollout_status.set(true); + let _ = self.fd.poll_backpressure(); + self.config.pollout_status.set(false); + } + PollMode::Sleep => thread::sleep(self.config.poll.bp_timeout), + PollMode::None => {} + 
} + + self.idle_timestamp = None; #[cfg(feature = "stats")] unsafe { - (*self.stats.app.get()).tx_wakeup_sendtos += 1; + (*self.stats.app.get()).backpressure += 1; } - let _ = self.fd.kick(); - } - - if self.tx.reserve(num, &mut idx_tx) == num { - return idx_tx; } + } - if let Some(poll_config) = &self.shared.poll_config - && self.outstanding_tx >= poll_config.bp_threshold - { - thread::sleep(poll_config.bp_timeout); - self.idle_timestamp = None; + let mut idx_tx = 0; + while self.tx.reserve(num, &mut idx_tx) != num { + self.complete_tx_rx(); + if self.config.xsk.mode.contains(Mode::FLASH_BUSY_POLL) || self.tx.needs_wakeup() { #[cfg(feature = "stats")] unsafe { - (*self.stats.app.get()).backpressure += 1; + (*self.stats.app.get()).tx_wakeup_sendtos += 1; } + self.fd.kick_tx(); } } + + idx_tx } #[cfg(feature = "pool")] @@ -266,22 +285,20 @@ impl Socket { pub fn recv(&mut self) -> SocketResult> { self.complete_tx_rx(); - if let Some(poll_config) = &self.shared.poll_config + if self.config.poll.mode != PollMode::None && let Some(idle_timestamp) = self.idle_timestamp && self.clock.now() >= idle_timestamp - && !self.fd.poll()? + && !self.fd.poll_idle()? 
{ - self.idle_timestamp = self.clock.now().checked_add(poll_config.idle_timeout); + self.idle_timestamp = self.clock.now().checked_add(self.config.poll.idle_timeout); return Ok(vec![]); } let mut idx_rx = 0; - let rcvd = self.rx.peek(self.shared.xsk_config.batch_size, &mut idx_rx); + let rcvd = self.rx.peek(self.config.xsk.batch_size, &mut idx_rx); if rcvd == 0 { - if self.shared.xsk_config.mode.contains(Mode::FLASH_BUSY_POLL) - || self.fill.needs_wakeup() - { + if self.config.xsk.mode.contains(Mode::FLASH_BUSY_POLL) || self.fill.needs_wakeup() { #[cfg(feature = "stats")] unsafe { (*self.stats.app.get()).rx_empty_polls += 1; @@ -289,17 +306,15 @@ impl Socket { self.fd.wakeup(); } - if let Some(poll_config) = &self.shared.poll_config - && self.idle_timestamp.is_none() - { - self.idle_timestamp = self.clock.now().checked_add(poll_config.idle_timeout); + if self.config.poll.mode != PollMode::None && self.idle_timestamp.is_none() { + self.idle_timestamp = self.clock.now().checked_add(self.config.poll.idle_timeout); } return Ok(vec![]); } - if let Some(poll_config) = &self.shared.poll_config - && (rcvd >= poll_config.idle_threshold || self.outstanding_tx > 0) + if self.config.poll.mode != PollMode::None + && (rcvd >= self.config.poll.idle_threshold || self.outstanding_tx > 0) { self.idle_timestamp = None; } @@ -317,6 +332,7 @@ impl Socket { self.replenish_fq(rcvd); self.rx.release(rcvd); + self.try_kick_rx(); #[cfg(feature = "stats")] unsafe { @@ -413,6 +429,7 @@ impl Socket { } self.fill.submit(n); + self.try_kick_rx(); #[cfg(feature = "stats")] unsafe { From 78edc416cd27fcc1a6620dab0f02591466007aa7 Mon Sep 17 00:00:00 2001 From: Arghyadip Chakraborty Date: Fri, 31 Oct 2025 02:09:52 +0530 Subject: [PATCH 40/43] fix(rust): updated Fd to SocketFd in stats --- lib/flash-rs/src/stats/stats.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/flash-rs/src/stats/stats.rs b/lib/flash-rs/src/stats/stats.rs index b0351ab..3929cd3 100644 --- 
a/lib/flash-rs/src/stats/stats.rs +++ b/lib/flash-rs/src/stats/stats.rs @@ -2,14 +2,14 @@ use std::{cell::UnsafeCell, mem}; use crate::{ config::XdpFlags, - fd::{Fd, FdError}, + fd::{FdError, SocketFd}, }; use super::sub::{AppStats, Interface, RingStats, XdpStats}; #[derive(Debug)] pub struct Stats { - fd: Fd, + fd: SocketFd, pub interface: Interface, pub xdp_flags: XdpFlags, pub(crate) ring: UnsafeCell, @@ -20,7 +20,7 @@ unsafe impl Send for Stats {} unsafe impl Sync for Stats {} impl Stats { - pub(crate) fn new(fd: Fd, ifname: String, ifqueue: u32, xdp_flags: XdpFlags) -> Self { + pub(crate) fn new(fd: SocketFd, ifname: String, ifqueue: u32, xdp_flags: XdpFlags) -> Self { Self { fd, interface: Interface { From 68c76ead4d0e9aa43260ab2db424b9d1af44fd67 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Fri, 31 Oct 2025 02:33:52 +0530 Subject: [PATCH 41/43] fix: libxdp installation not working on old ubuntu --- tools/ci_build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ci_build.sh b/tools/ci_build.sh index 7da4496..f10fb7c 100755 --- a/tools/ci_build.sh +++ b/tools/ci_build.sh @@ -12,6 +12,8 @@ $SUDO apt update $SUDO apt install -y build-essential meson libbpf-dev pkg-config git gcc-multilib clang llvm lld m4 libpcap-dev libcjson-dev libncurses-dev libnuma-dev git clone https://github.com/xdp-project/xdp-tools.git +make -j -C xdp-tools libxdp +$SUDO make -j -C xdp-tools libxdp_install make PREFIX=/usr -j -C xdp-tools libxdp $SUDO PREFIX=/usr make -j -C xdp-tools libxdp_install From da9fdac8adcbdbeb4ed6e70c37ec3c28b49f2d07 Mon Sep 17 00:00:00 2001 From: Arghyadip Chakraborty Date: Fri, 31 Oct 2025 03:04:03 +0530 Subject: [PATCH 42/43] fix(rust): cli flags --- examples/arpresolver-rs/src/cli.rs | 4 ++-- examples/firewall-rs/src/cli.rs | 4 ++-- examples/ip4ping-rs/src/cli.rs | 4 ++-- examples/l2fwd-rs/src/cli.rs | 4 ++-- examples/maglev-rs/src/cli.rs | 4 ++-- examples/simplefwd-rs/src/cli.rs | 4 ++-- lib/flash-rs/src/config/config_clap.rs | 2 +- 7 files 
changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/arpresolver-rs/src/cli.rs b/examples/arpresolver-rs/src/cli.rs index 00e29e8..ff946a2 100644 --- a/examples/arpresolver-rs/src/cli.rs +++ b/examples/arpresolver-rs/src/cli.rs @@ -45,13 +45,13 @@ pub struct Cli { pub struct StatsConfig { #[arg( short = 's', - long, + long = "stats-cpu", default_value_t = 1, help = "CPU core index for stats thread" )] pub cpu: usize, - #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + #[arg(short = 'F', long, default_value_t = 1, help = "Tui frames per second")] pub fps: u64, #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] diff --git a/examples/firewall-rs/src/cli.rs b/examples/firewall-rs/src/cli.rs index 7123c6d..3c3448a 100644 --- a/examples/firewall-rs/src/cli.rs +++ b/examples/firewall-rs/src/cli.rs @@ -47,13 +47,13 @@ pub struct Cli { pub struct StatsConfig { #[arg( short = 's', - long, + long = "stats-cpu", default_value_t = 1, help = "CPU core index for stats thread" )] pub cpu: usize, - #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + #[arg(short = 'F', long, default_value_t = 1, help = "Tui frames per second")] pub fps: u64, #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] diff --git a/examples/ip4ping-rs/src/cli.rs b/examples/ip4ping-rs/src/cli.rs index 9d5cf63..cfc783e 100644 --- a/examples/ip4ping-rs/src/cli.rs +++ b/examples/ip4ping-rs/src/cli.rs @@ -42,13 +42,13 @@ pub struct Cli { pub struct StatsConfig { #[arg( short = 's', - long, + long = "stats-cpu", default_value_t = 1, help = "CPU core index for stats thread" )] pub cpu: usize, - #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + #[arg(short = 'F', long, default_value_t = 1, help = "Tui frames per second")] pub fps: u64, #[arg(short = 'l', long, 
default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] diff --git a/examples/l2fwd-rs/src/cli.rs b/examples/l2fwd-rs/src/cli.rs index db57cb9..e5de218 100644 --- a/examples/l2fwd-rs/src/cli.rs +++ b/examples/l2fwd-rs/src/cli.rs @@ -42,13 +42,13 @@ pub struct Cli { pub struct StatsConfig { #[arg( short = 's', - long, + long = "stats-cpu", default_value_t = 1, help = "CPU core index for stats thread" )] pub cpu: usize, - #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + #[arg(short = 'F', long, default_value_t = 1, help = "Tui frames per second")] pub fps: u64, #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] diff --git a/examples/maglev-rs/src/cli.rs b/examples/maglev-rs/src/cli.rs index 84451d9..5f77a23 100644 --- a/examples/maglev-rs/src/cli.rs +++ b/examples/maglev-rs/src/cli.rs @@ -47,13 +47,13 @@ pub struct Cli { pub struct StatsConfig { #[arg( short = 's', - long, + long = "stats-cpu", default_value_t = 1, help = "CPU core index for stats thread" )] pub cpu: usize, - #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + #[arg(short = 'F', long, default_value_t = 1, help = "Tui frames per second")] pub fps: u64, #[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] diff --git a/examples/simplefwd-rs/src/cli.rs b/examples/simplefwd-rs/src/cli.rs index db57cb9..e5de218 100644 --- a/examples/simplefwd-rs/src/cli.rs +++ b/examples/simplefwd-rs/src/cli.rs @@ -42,13 +42,13 @@ pub struct Cli { pub struct StatsConfig { #[arg( short = 's', - long, + long = "stats-cpu", default_value_t = 1, help = "CPU core index for stats thread" )] pub cpu: usize, - #[arg(short = 'f', long, default_value_t = 1, help = "Tui frames per second")] + #[arg(short = 'F', long, default_value_t = 1, help = "Tui frames per second")] pub fps: u64, 
#[arg(short = 'l', long, default_value_t = GridLayout::default(), value_parser = GridLayout::from_str, help = "Tui layout")] diff --git a/lib/flash-rs/src/config/config_clap.rs b/lib/flash-rs/src/config/config_clap.rs index eed8ff7..cefb957 100644 --- a/lib/flash-rs/src/config/config_clap.rs +++ b/lib/flash-rs/src/config/config_clap.rs @@ -19,7 +19,7 @@ pub struct FlashConfig { pub(crate) smart_poll: bool, #[arg( - short = 'p', + short = 'P', long, default_value_t = false, help = "Enable periodic sleep mode" From dbe2bd869cf1bec3eea0809b46c0cfc9f95a3948 Mon Sep 17 00:00:00 2001 From: Debojeet Das Date: Fri, 31 Oct 2025 03:23:07 +0530 Subject: [PATCH 43/43] fix: libxdp build not working - libxdp is not building on old ubuntu - quick fix removed ubuntu 22 from ci/cd pipeline --- .github/workflows/main.yml | 2 +- tools/ci_build.sh | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1d9caf0..d68d15c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,7 +26,7 @@ jobs: build: strategy: matrix: - os: [ubuntu-22.04, ubuntu-latest] + os: [ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/tools/ci_build.sh b/tools/ci_build.sh index f10fb7c..7da4496 100755 --- a/tools/ci_build.sh +++ b/tools/ci_build.sh @@ -12,8 +12,6 @@ $SUDO apt update $SUDO apt install -y build-essential meson libbpf-dev pkg-config git gcc-multilib clang llvm lld m4 libpcap-dev libcjson-dev libncurses-dev libnuma-dev git clone https://github.com/xdp-project/xdp-tools.git -make -j -C xdp-tools libxdp -$SUDO make -j -C xdp-tools libxdp_install make PREFIX=/usr -j -C xdp-tools libxdp $SUDO PREFIX=/usr make -j -C xdp-tools libxdp_install