From 9412d8fa4636637e3e78cca3bbe0f2d0a8c684cf Mon Sep 17 00:00:00 2001 From: Thomas Haller Date: Fri, 1 Oct 2021 15:27:52 +0200 Subject: [PATCH] Squashed 'src/n-acd/' content from commit a600afc87087 git-subtree-dir: src/n-acd git-subtree-split: a600afc870872bbdfc8081ca68d5665334cb9e6e --- .editorconfig | 11 + .github/workflows/ci.yml | 122 +++++ .gitmodules | 12 + AUTHORS | 39 ++ NEWS.md | 46 ++ README.md | 60 +++ meson.build | 27 + meson_options.txt | 1 + src/libnacd.sym | 28 ++ src/meson.build | 95 ++++ src/n-acd-bpf-fallback.c | 30 ++ src/n-acd-bpf.c | 317 ++++++++++++ src/n-acd-private.h | 154 ++++++ src/n-acd-probe.c | 712 ++++++++++++++++++++++++++ src/n-acd.c | 1027 ++++++++++++++++++++++++++++++++++++++ src/n-acd.h | 150 ++++++ src/test-api.c | 88 ++++ src/test-bpf.c | 226 +++++++++ src/test-loopback.c | 82 +++ src/test-twice.c | 97 ++++ src/test-unplug.c | 84 ++++ src/test-unused.c | 63 +++ src/test-veth.c | 240 +++++++++ src/test.h | 213 ++++++++ src/util/test-timer.c | 177 +++++++ src/util/timer.c | 189 +++++++ src/util/timer.h | 54 ++ subprojects/c-list | 1 + subprojects/c-rbtree | 1 + subprojects/c-siphash | 1 + subprojects/c-stdaux | 1 + 31 files changed, 4348 insertions(+) create mode 100644 .editorconfig create mode 100644 .github/workflows/ci.yml create mode 100644 .gitmodules create mode 100644 AUTHORS create mode 100644 NEWS.md create mode 100644 README.md create mode 100644 meson.build create mode 100644 meson_options.txt create mode 100644 src/libnacd.sym create mode 100644 src/meson.build create mode 100644 src/n-acd-bpf-fallback.c create mode 100644 src/n-acd-bpf.c create mode 100644 src/n-acd-private.h create mode 100644 src/n-acd-probe.c create mode 100644 src/n-acd.c create mode 100644 src/n-acd.h create mode 100644 src/test-api.c create mode 100644 src/test-bpf.c create mode 100644 src/test-loopback.c create mode 100644 src/test-twice.c create mode 100644 src/test-unplug.c create mode 100644 src/test-unused.c create mode 100644 
src/test-veth.c create mode 100644 src/test.h create mode 100644 src/util/test-timer.c create mode 100644 src/util/timer.c create mode 100644 src/util/timer.h create mode 160000 subprojects/c-list create mode 160000 subprojects/c-rbtree create mode 160000 subprojects/c-siphash create mode 160000 subprojects/c-stdaux diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000..b10bb4f3f8 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,11 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +charset = utf-8 + +[*.{c,h}] +indent_style = space +indent_size = 8 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000..22fc814187 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,122 @@ +name: Continuous Integration + +on: + push: + pull_request: + schedule: + - cron: '0 0 * * *' + +jobs: + ci: + name: CI with Default Configuration + runs-on: ubuntu-latest + + steps: + # + # Prepare CI + # + # We cannot use the github-action of the `ci-c-util` project, because we + # need privileges in the container. Therefore, fetch the CI sources and + # build the container manually. + # + - name: Fetch CI + uses: actions/checkout@v2 + with: + repository: c-util/automation + ref: v1 + path: automation + - name: Build CI + working-directory: automation/src/ci-c-util + run: docker build --tag ci-c-util:v1 . + + # + # Run CI + # + # Take the CI image we built and run the CI with the default project + # configuration. We do not use valgrind, since it falls-over with bpf(2) + # syscalls. + # + - name: Fetch Sources + uses: actions/checkout@v2 + with: + path: source + - name: Run through C-Util CI + run: | + docker run \ + --privileged \ + -v "$(pwd)/source:/github/workspace" \ + "ci-c-util:v1" \ + "--m32=1" \ + "--source=/github/workspace" + + ci-no-ebpf: + name: CI without eBPF + runs-on: ubuntu-latest + + steps: + # See above in 'ci' job. 
+ - name: Fetch CI + uses: actions/checkout@v2 + with: + repository: c-util/automation + ref: v1 + path: automation + - name: Build CI + working-directory: automation/src/ci-c-util + run: docker build --tag ci-c-util:v1 . + + # + # Run CI + # + # This again runs the CI, but this time disables eBPF. We do support the + # legacy BPF fallback, so lets make sure we test for it. + # + - name: Fetch Sources + uses: actions/checkout@v2 + with: + path: source + - name: Run through C-Util CI + run: | + docker run \ + --privileged \ + -v "$(pwd)/source:/github/workspace" \ + "ci-c-util:v1" \ + "--m32=1" \ + "--mesonargs=-Debpf=false" \ + "--source=/github/workspace" + + ci-valgrind: + name: CI through Valgrind + runs-on: ubuntu-latest + + steps: + # See above in 'ci' job. + - name: Fetch CI + uses: actions/checkout@v2 + with: + repository: c-util/automation + ref: v1 + path: automation + - name: Build CI + working-directory: automation/src/ci-c-util + run: docker build --tag ci-c-util:v1 . + + # + # Run CI + # + # This again runs the CI, but this time through valgrind. Since some + # syscalls are not implemented on x86-64 32bit compat (e.g., bpf(2)), we + # disable the m32 mode. 
+ # + - name: Fetch Sources + uses: actions/checkout@v2 + with: + path: source + - name: Run through C-Util CI + run: | + docker run \ + --privileged \ + -v "$(pwd)/source:/github/workspace" \ + "ci-c-util:v1" \ + "--source=/github/workspace" \ + "--valgrind=1" diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..04829bdba8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,12 @@ +[submodule "subprojects/c-list"] + path = subprojects/c-list + url = https://github.com/c-util/c-list.git +[submodule "subprojects/c-siphash"] + path = subprojects/c-siphash + url = https://github.com/c-util/c-siphash.git +[submodule "subprojects/c-rbtree"] + path = subprojects/c-rbtree + url = https://github.com/c-util/c-rbtree.git +[submodule "subprojects/c-stdaux"] + path = subprojects/c-stdaux + url = https://github.com/c-util/c-stdaux.git diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000000..98ff148232 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,39 @@ +LICENSE: + This project is dual-licensed under both the Apache License, Version + 2.0, and the GNU Lesser General Public License, Version 2.1+. + +AUTHORS-ASL: + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +AUTHORS-LGPL: + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; If not, see <http://www.gnu.org/licenses/>. + +COPYRIGHT: (ordered alphabetically) + Copyright (C) 2015-2019 Red Hat, Inc. + +AUTHORS: (ordered alphabetically) + Beniamino Galvani + David Rheinsberg + Thomas Haller + Tom Gundersen diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000000..7a9ccd67c0 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,46 @@ +# n-acd - IPv4 Address Conflict Detection + +## CHANGES WITH 2: + + * All public destructors now include a variant that returns `void`. + This was requested for easier integration with `glib` and friends. + Similar to the `cleanup` variants, these variants are denoted by a + single-character function-name suffix. E.g., `n_acd_freev()` + + * A fallback to `CLOCK_MONOTONIC` is now provided in case + `CLOCK_BOOTTIME` is not supported by the kernel. Note that this is in + no way signalled through the API, so if timers should follow the + `BOOTTIME` rather than monotonic clock, a kernel with this clock is + required. + + * The `c-sundry` dependency is no longer needed. + + * The `transport` configuration property is now mandatory for + `n_acd_new()`. It defaulted to `ETHERNET` before, by mistake. + + * In-source documentation for the public API is now provided. + + Contributions from: Beniamino Galvani, David Herrmann, David + Rheinsberg, Thomas Haller, Tom Gundersen + + - Tübingen, 2019-03-20 + +## CHANGES WITH 1: + + * Initial release of n-acd. This project implements the IPv4 Address + Conflict Detection standard as defined in RFC-5227. The state machine + is implemented in a shared library and provides a stable ISO-C11 API. 
+ The implementation is linux-only and relies heavily on the API + behavior of recent linux kernel releases. + + * Compared to the pre-releases, this release supports many parallel + probes on a single n-acd context. This reduces the number of + allocated network resources to O(1), based on the number of running + parallel probes. + + * The n-acd project is now dual-licensed: ASL-2.0 and LGPL-2.1+ + + Contributions from: Beniamino Galvani, David Herrmann, Thomas Haller, + Tom Gundersen + + - Tübingen, 2018-08-08 diff --git a/README.md b/README.md new file mode 100644 index 0000000000..089541825d --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +n-acd +===== + +IPv4 Address Conflict Detection + +The n-acd project implements the IPv4 Address Conflict Detection standard as +defined in RFC-5227. The state machine is implemented in a shared library and +provides a stable ISO-C11 API. The implementation is linux-only and relies +heavily on the API behavior of recent linux kernel releases. + +### Project + + * **Website**: + * **Bug Tracker**: + * **Mailing-List**: + +### Requirements + +The requirements for this project are: + + * `Linux kernel >= 3.19` + * `libc` (e.g., `glibc >= 2.16`) + +At build-time, the following software is required: + + * `meson >= 0.41` + * `pkg-config >= 0.29` + +### Build + +The meson build-system is used for this project. Consult upstream +documentation for detailed help. In most situations the following +commands are sufficient to build and install from source: + +```sh +mkdir build +cd build +meson setup .. +ninja +meson test +ninja install +``` + +The following configuration options are available: + + * `ebpf`: This boolean controls whether `ebpf` features are used to improve + the packet filtering performance. If disabled, classic bpf will be + used. This feature requires a rather recent kernel (>=3.19). 
+ Default is: true + +### Repository: + + - **web**: <https://github.com/nettools/n-acd> + - **https**: `https://github.com/nettools/n-acd.git` + - **ssh**: `git@github.com:nettools/n-acd.git` + +### License: + + - **Apache-2.0** OR **LGPL-2.1-or-later** + - See AUTHORS file for details. diff --git a/meson.build b/meson.build new file mode 100644 index 0000000000..017dec5669 --- /dev/null +++ b/meson.build @@ -0,0 +1,27 @@ +project( + 'n-acd', + 'c', + version: '2', + license: 'Apache', + default_options: [ + 'c_std=c11', + ], +) +project_description = 'IPv4 Address Conflict Detection' + +add_project_arguments('-D_GNU_SOURCE', language: 'c') +mod_pkgconfig = import('pkgconfig') + +sub_clist = subproject('c-list') +sub_crbtree = subproject('c-rbtree') +sub_csiphash = subproject('c-siphash') +sub_cstdaux = subproject('c-stdaux') + +dep_clist = sub_clist.get_variable('libclist_dep') +dep_crbtree = sub_crbtree.get_variable('libcrbtree_dep') +dep_csiphash = sub_csiphash.get_variable('libcsiphash_dep') +dep_cstdaux = sub_cstdaux.get_variable('libcstdaux_dep') + +use_ebpf = get_option('ebpf') + +subdir('src') diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000000..b024ee1d4c --- /dev/null +++ b/meson_options.txt @@ -0,0 +1 @@ +option('ebpf', type: 'boolean', value: true, description: 'Enable eBPF packet filtering') diff --git a/src/libnacd.sym b/src/libnacd.sym new file mode 100644 index 0000000000..f85e13acf9 --- /dev/null +++ b/src/libnacd.sym @@ -0,0 +1,28 @@ +LIBNACD_2 { +global: + n_acd_config_new; + n_acd_config_free; + n_acd_config_set_ifindex; + n_acd_config_set_transport; + n_acd_config_set_mac; + + n_acd_probe_config_new; + n_acd_probe_config_free; + n_acd_probe_config_set_ip; + n_acd_probe_config_set_timeout; + + n_acd_new; + n_acd_ref; + n_acd_unref; + n_acd_get_fd; + n_acd_dispatch; + n_acd_pop_event; + n_acd_probe; + + n_acd_probe_free; + n_acd_probe_set_userdata; + n_acd_probe_get_userdata; + n_acd_probe_announce; +local: + *; +}; diff --git a/src/meson.build 
b/src/meson.build new file mode 100644 index 0000000000..3e92681f91 --- /dev/null +++ b/src/meson.build @@ -0,0 +1,95 @@ +# +# target: libnacd.so +# + +libnacd_symfile = join_paths(meson.current_source_dir(), 'libnacd.sym') + +libnacd_deps = [ + dep_clist, + dep_crbtree, + dep_csiphash, + dep_cstdaux, +] + +libnacd_sources = [ + 'n-acd.c', + 'n-acd-probe.c', + 'util/timer.c', +] + +if use_ebpf + libnacd_sources += [ + 'n-acd-bpf.c', + ] +else + libnacd_sources += [ + 'n-acd-bpf-fallback.c', + ] +endif + +libnacd_private = static_library( + 'nacd-private', + libnacd_sources, + c_args: [ + '-fvisibility=hidden', + '-fno-common' + ], + dependencies: libnacd_deps, + pic: true, +) + +libnacd_shared = shared_library( + 'nacd', + objects: libnacd_private.extract_all_objects(), + dependencies: libnacd_deps, + install: not meson.is_subproject(), + soversion: 0, + link_depends: libnacd_symfile, + link_args: [ + '-Wl,--no-undefined', + '-Wl,--version-script=@0@'.format(libnacd_symfile) + ], +) + +libnacd_dep = declare_dependency( + include_directories: include_directories('.'), + link_with: libnacd_private, + dependencies: libnacd_deps, + version: meson.project_version(), +) + +if not meson.is_subproject() + install_headers('n-acd.h') + + mod_pkgconfig.generate( + libraries: libnacd_shared, + version: meson.project_version(), + name: 'libnacd', + filebase: 'libnacd', + description: project_description, + ) +endif + +# +# target: test-* +# + +test_api = executable('test-api', ['test-api.c'], link_with: libnacd_shared) +test('API Symbol Visibility', test_api) + +if use_ebpf + test_bpf = executable('test-bpf', ['test-bpf.c'], dependencies: libnacd_dep) + test('eBPF socket filtering', test_bpf) +endif + +test_loopback = executable('test-loopback', ['test-loopback.c'], dependencies: libnacd_dep) +test('Echo Suppression via Loopback', test_loopback) + +test_timer = executable('test-timer', ['util/test-timer.c'], dependencies: libnacd_dep) +test('Timer helper', test_timer) + 
+#test_unplug = executable('test-unplug', ['test-unplug.c'], dependencies: libnacd_dep) +#test('Async Interface Hotplug', test_unplug) + +test_veth = executable('test-veth', ['test-veth.c'], dependencies: libnacd_dep) +test('Parallel ACD instances', test_veth) diff --git a/src/n-acd-bpf-fallback.c b/src/n-acd-bpf-fallback.c new file mode 100644 index 0000000000..3cf4eb0679 --- /dev/null +++ b/src/n-acd-bpf-fallback.c @@ -0,0 +1,30 @@ +/* + * A noop implementation of eBPF filter for IPv4 Address Conflict Detection + * + * These are a collection of dummy functions that have no effect, but allow + * n-acd to compile without eBPF support. + * + * See n-acd-bpf.c for documentation. + */ + +#include +#include +#include "n-acd-private.h" + +int n_acd_bpf_map_create(int *mapfdp, size_t max_entries) { + *mapfdp = -1; + return 0; +} + +int n_acd_bpf_map_add(int mapfd, struct in_addr *addrp) { + return 0; +} + +int n_acd_bpf_map_remove(int mapfd, struct in_addr *addrp) { + return 0; +} + +int n_acd_bpf_compile(int *progfdp, int mapfd, struct ether_addr *macp) { + *progfdp = -1; + return 0; +} diff --git a/src/n-acd-bpf.c b/src/n-acd-bpf.c new file mode 100644 index 0000000000..57b29ddf2f --- /dev/null +++ b/src/n-acd-bpf.c @@ -0,0 +1,317 @@ +/* + * eBPF filter for IPv4 Address Conflict Detection + * + * An eBPF map and an eBPF program are provided. The map contains all the + * addresses address conflict detection is performed on, and the program + * filters out all packets except exactly the packets relevant to the ACD + * protocol on the addresses currently in the map. + * + * Note that userspace still has to filter the incoming packets, as filters + * are applied when packets are queued on the socket, not when userspace + * receives them. It is therefore possible to receive packets about addresses + * that have already been removed. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "n-acd-private.h" + +#define BPF_LD_ABS(SIZE, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM, \ + }) + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0, \ + }) + +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = BPF_PSEUDO_MAP_FD, \ + .off = 0, \ + .imm = (__u32) (MAP_FD), \ + }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (MAP_FD)) >> 32, \ + }) + +#define BPF_ALU_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0, \ + }) + +#define BPF_ALU_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM, \ + }) + +#define BPF_MOV_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0, \ + }) + +#define BPF_MOV_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM, \ + }) + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0, \ + }) + +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0, \ + }) + +#define 
BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM, \ + }) + +#define BPF_EMIT_CALL(FUNC) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = FUNC, \ + }) + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0, \ + }) + +static int n_acd_syscall_bpf(int cmd, union bpf_attr *attr, unsigned int size) { + return (int)syscall(__NR_bpf, cmd, attr, size); +} + +int n_acd_bpf_map_create(int *mapfdp, size_t max_entries) { + union bpf_attr attr; + int mapfd; + + memset(&attr, 0, sizeof(attr)); + attr = (union bpf_attr){ + .map_type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(uint32_t), + .value_size = sizeof(uint8_t), /* values are never used, but must be set */ + .max_entries = max_entries, + }; + + mapfd = n_acd_syscall_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); + if (mapfd < 0) + return -errno; + + *mapfdp = mapfd; + return 0; +} + +int n_acd_bpf_map_add(int mapfd, struct in_addr *addrp) { + union bpf_attr attr; + uint32_t addr = be32toh(addrp->s_addr); + uint8_t _dummy = 0; + int r; + + memset(&attr, 0, sizeof(attr)); + attr = (union bpf_attr){ + .map_fd = mapfd, + .key = (uint64_t)(unsigned long)&addr, + .value = (uint64_t)(unsigned long)&_dummy, + .flags = BPF_NOEXIST, + }; + + r = n_acd_syscall_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); + if (r < 0) + return -errno; + + return 0; +} + +int n_acd_bpf_map_remove(int mapfd, struct in_addr *addrp) { + uint32_t addr = be32toh(addrp->s_addr); + union bpf_attr attr; + int r; + + memset(&attr, 0, sizeof(attr)); + attr = (union bpf_attr){ + .map_fd = mapfd, + .key = (uint64_t)(unsigned long)&addr, + }; + + r = n_acd_syscall_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); + if (r < 0) + return -errno; + + return 0; +} + +int n_acd_bpf_compile(int *progfdp, 
int mapfd, struct ether_addr *macp) { + const union { + uint8_t u8[6]; + uint16_t u16[3]; + uint32_t u32[1]; + } mac = { + .u8 = { + macp->ether_addr_octet[0], + macp->ether_addr_octet[1], + macp->ether_addr_octet[2], + macp->ether_addr_octet[3], + macp->ether_addr_octet[4], + macp->ether_addr_octet[5], + }, + }; + struct bpf_insn prog[] = { + /* for using BPF_LD_ABS r6 must point to the skb, currently in r1 */ + BPF_MOV_REG(6, 1), /* r6 = r1 */ + + /* drop the packet if it is too short */ + BPF_LDX_MEM(BPF_W, 0, 6, offsetof(struct __sk_buff, len)), /* r0 = skb->len */ + BPF_JMP_IMM(BPF_JGE, 0, sizeof(struct ether_arp), 2), /* if (r0 >= sizeof(ether_arp)) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + /* drop the packet if the header is not as expected */ + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_hrd)), /* r0 = header type */ + BPF_JMP_IMM(BPF_JEQ, 0, ARPHRD_ETHER, 2), /* if (r0 == ethernet) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_pro)), /* r0 = protocol */ + BPF_JMP_IMM(BPF_JEQ, 0, ETHERTYPE_IP, 2), /* if (r0 == IP) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + BPF_LD_ABS(BPF_B, offsetof(struct ether_arp, arp_hln)), /* r0 = hw addr length */ + BPF_JMP_IMM(BPF_JEQ, 0, sizeof(struct ether_addr), 2), /* if (r0 == sizeof(ether_addr)) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + BPF_LD_ABS(BPF_B, offsetof(struct ether_arp, arp_pln)), /* r0 = protocol addr length */ + BPF_JMP_IMM(BPF_JEQ, 0, sizeof(struct in_addr), 2), /* if (r0 == sizeof(in_addr)) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + /* drop packets from our own mac address */ + BPF_LD_ABS(BPF_W, offsetof(struct ether_arp, arp_sha)), /* r0 = first four bytes of packet mac address */ + BPF_JMP_IMM(BPF_JNE, 0, be32toh(mac.u32[0]), 4), /* if (r0 != first four bytes of our mac 
address) skip 4 */ + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_sha) + 4), /* r0 = last two bytes of packet mac address */ + BPF_JMP_IMM(BPF_JNE, 0, be16toh(mac.u16[2]), 2), /* if (r0 != last two bytes of our mac address) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + /* + * We listen for two kinds of packets: + * Conflicts) + * These are requests or replies with the sender address not set to INADDR_ANY. The + * conflicted address is the sender address, remember this in r7. + * Probes) + * These are requests with the sender address set to INADDR_ANY. The probed address + * is the target address, remember this in r7. + * Any other packets are dropped. + */ + BPF_LD_ABS(BPF_W, offsetof(struct ether_arp, arp_spa)), /* r0 = sender ip address */ + BPF_JMP_IMM(BPF_JEQ, 0, 0, 7), /* if (r0 == 0) skip 7 */ + BPF_MOV_REG(7, 0), /* r7 = r0 */ + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_op)), /* r0 = operation */ + BPF_JMP_IMM(BPF_JEQ, 0, ARPOP_REQUEST, 3), /* if (r0 == request) skip 3 */ + BPF_JMP_IMM(BPF_JEQ, 0, ARPOP_REPLY, 2), /* if (r0 == reply) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + BPF_JMP_IMM(BPF_JA, 0, 0, 6), /* skip 6 */ + BPF_LD_ABS(BPF_W, offsetof(struct ether_arp, arp_tpa)), /* r0 = target ip address */ + BPF_MOV_REG(7, 0), /* r7 = r0 */ + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_op)), /* r0 = operation */ + BPF_JMP_IMM(BPF_JEQ, 0, ARPOP_REQUEST, 2), /* if (r0 == request) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + /* check if the probe or conflict is for an address we are monitoring */ + BPF_STX_MEM(BPF_W, 10, 7, -4), /* *(uint32_t*)fp - 4 = r7 */ + BPF_MOV_REG(2, 10), /* r2 = fp */ + BPF_ALU_IMM(BPF_ADD, 2, -4), /* r2 -= 4 */ + BPF_LD_MAP_FD(1, mapfd), /* r1 = mapfd */ + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), /* r0 = map_lookup_elem(r1, r2) */ + BPF_JMP_IMM(BPF_JNE, 0, 0, 2), /* if (r0 != NULL) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 
0 */ + BPF_EXIT_INSN(), /* return */ + + /* return exactly the packet length*/ + BPF_MOV_IMM(0, sizeof(struct ether_arp)), /* r0 = sizeof(struct ether_arp) */ + BPF_EXIT_INSN(), /* return */ + }; + union bpf_attr attr; + int progfd; + + memset(&attr, 0, sizeof(attr)); + attr = (union bpf_attr){ + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .insns = (uint64_t)(unsigned long)prog, + .insn_cnt = sizeof(prog) / sizeof(*prog), + .license = (uint64_t)(unsigned long)"ASL", + }; + + progfd = n_acd_syscall_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + if (progfd < 0) + return -errno; + + *progfdp = progfd; + return 0; +} diff --git a/src/n-acd-private.h b/src/n-acd-private.h new file mode 100644 index 0000000000..4583c018e2 --- /dev/null +++ b/src/n-acd-private.h @@ -0,0 +1,154 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/timer.h" +#include "n-acd.h" + +typedef struct NAcdEventNode NAcdEventNode; + +/* This augments the error-codes with internal ones that are never exposed. 
*/ +enum { + _N_ACD_INTERNAL = _N_ACD_E_N, + + N_ACD_E_DROPPED, +}; + +enum { + N_ACD_PROBE_STATE_PROBING, + N_ACD_PROBE_STATE_CONFIGURING, + N_ACD_PROBE_STATE_ANNOUNCING, + N_ACD_PROBE_STATE_FAILED, +}; + +struct NAcdConfig { + int ifindex; + unsigned int transport; + uint8_t mac[ETH_ALEN]; + size_t n_mac; +}; + +#define N_ACD_CONFIG_NULL(_x) { \ + .transport = _N_ACD_TRANSPORT_N, \ + } + +struct NAcdProbeConfig { + struct in_addr ip; + uint64_t timeout_msecs; +}; + +#define N_ACD_PROBE_CONFIG_NULL(_x) { \ + .timeout_msecs = N_ACD_TIMEOUT_RFC5227, \ + } + +struct NAcdEventNode { + CList acd_link; + CList probe_link; + NAcdEvent event; + uint8_t sender[ETH_ALEN]; + bool is_public : 1; +}; + +#define N_ACD_EVENT_NODE_NULL(_x) { \ + .acd_link = C_LIST_INIT((_x).acd_link), \ + .probe_link = C_LIST_INIT((_x).probe_link), \ + } + +struct NAcd { + unsigned long n_refs; + unsigned int seed; + int fd_epoll; + int fd_socket; + CRBTree ip_tree; + CList event_list; + Timer timer; + + /* BPF map */ + int fd_bpf_map; + size_t n_bpf_map; + size_t max_bpf_map; + + /* configuration */ + int ifindex; + uint8_t mac[ETH_ALEN]; + + /* flags */ + bool preempted : 1; +}; + +#define N_ACD_NULL(_x) { \ + .n_refs = 1, \ + .fd_epoll = -1, \ + .fd_socket = -1, \ + .ip_tree = C_RBTREE_INIT, \ + .event_list = C_LIST_INIT((_x).event_list), \ + .timer = TIMER_NULL((_x).timer), \ + .fd_bpf_map = -1, \ + } + +struct NAcdProbe { + NAcd *acd; + CRBNode ip_node; + CList event_list; + Timeout timeout; + + /* configuration */ + struct in_addr ip; + uint64_t timeout_multiplier; + void *userdata; + + /* state */ + unsigned int state; + unsigned int n_iteration; + unsigned int defend; + uint64_t last_defend; +}; + +#define N_ACD_PROBE_NULL(_x) { \ + .ip_node = C_RBNODE_INIT((_x).ip_node), \ + .event_list = C_LIST_INIT((_x).event_list), \ + .timeout = TIMEOUT_INIT((_x).timeout), \ + .state = N_ACD_PROBE_STATE_PROBING, \ + .defend = N_ACD_DEFEND_NEVER, \ + } + +/* events */ + +int 
n_acd_event_node_new(NAcdEventNode **nodep); +NAcdEventNode *n_acd_event_node_free(NAcdEventNode *node); + +/* contexts */ + +void n_acd_remember(NAcd *acd, uint64_t now, bool success); +int n_acd_raise(NAcd *acd, NAcdEventNode **nodep, unsigned int event); +int n_acd_send(NAcd *acd, const struct in_addr *tpa, const struct in_addr *spa); +int n_acd_ensure_bpf_map_space(NAcd *acd); + +/* probes */ + +int n_acd_probe_new(NAcdProbe **probep, NAcd *acd, NAcdProbeConfig *config); +int n_acd_probe_raise(NAcdProbe *probe, NAcdEventNode **nodep, unsigned int event); +int n_acd_probe_handle_timeout(NAcdProbe *probe); +int n_acd_probe_handle_packet(NAcdProbe *probe, struct ether_arp *packet, bool hard_conflict); + +/* eBPF */ + +int n_acd_bpf_map_create(int *mapfdp, size_t max_elements); +int n_acd_bpf_map_add(int mapfd, struct in_addr *addr); +int n_acd_bpf_map_remove(int mapfd, struct in_addr *addr); + +int n_acd_bpf_compile(int *progfdp, int mapfd, struct ether_addr *mac); + +/* inline helpers */ + +static inline void n_acd_event_node_freep(NAcdEventNode **node) { + if (*node) + n_acd_event_node_free(*node); +} diff --git a/src/n-acd-probe.c b/src/n-acd-probe.c new file mode 100644 index 0000000000..c1ed59ae9e --- /dev/null +++ b/src/n-acd-probe.c @@ -0,0 +1,712 @@ +/* + * IPv4 Address Conflict Detection + * + * This file implements the probe object. A probe is basically the + * state-machine of a single ACD run. It takes an address to probe for, checks + * for conflicts and then defends it once configured. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "n-acd.h" +#include "n-acd-private.h" + +/* + * These parameters and timing intervals are specified in RFC-5227. 
The + * original values are: + * + * PROBE_NUM 3 + * PROBE_WAIT 1s + * PROBE_MIN 1s + * PROBE_MAX 3s + * ANNOUNCE_NUM 3 + * ANNOUNCE_WAIT 2s + * ANNOUNCE_INTERVAL 2s + * MAX_CONFLICTS 10 + * RATE_LIMIT_INTERVAL 60s + * DEFEND_INTERVAL 10s + * + * If we assume a best-case and worst-case scenario for non-conflicted runs, we + * end up with a runtime between 4s and 9s to finish the probe. Then it still + * takes a fixed 4s to finish the announcements. + * + * RFC 5227 section 1.1: + * [...] (Note that the values listed here are fixed constants; they are + * not intended to be modifiable by implementers, operators, or end users. + * These constants are given symbolic names here to facilitate the writing + * of future standards that may want to reference this document with + * different values for these named constants; however, at the present time + * no such future standards exist.) [...] + * + * Unfortunately, no-one ever stepped up to write a "future standard" to revise + * the timings. A 9s timeout for successful link setups is not acceptable today. + * Hence, we will just go forward and ignore the proposed values. On both + * wired and wireless local links round-trip latencies of below 3ms are common. + * We require the caller to set a timeout multiplier, where 1 corresponds to a + * total probe time between 0.5 ms and 1.0 ms. On modern networks a multiplier + * of about 100 should be a reasonable default. To comply with the RFC select a + * multiplier of 9000. 
+ */ +#define N_ACD_RFC_PROBE_NUM (3) +#define N_ACD_RFC_PROBE_WAIT_NSEC (UINT64_C(111111)) /* 1/9 ms */ +#define N_ACD_RFC_PROBE_MIN_NSEC (UINT64_C(111111)) /* 1/9 ms */ +#define N_ACD_RFC_PROBE_MAX_NSEC (UINT64_C(333333)) /* 3/9 ms */ +#define N_ACD_RFC_ANNOUNCE_NUM (3) +#define N_ACD_RFC_ANNOUNCE_WAIT_NSEC (UINT64_C(222222)) /* 2/9 ms */ +#define N_ACD_RFC_ANNOUNCE_INTERVAL_NSEC (UINT64_C(222222)) /* 2/9 ms */ +#define N_ACD_RFC_MAX_CONFLICTS (10) +#define N_ACD_RFC_RATE_LIMIT_INTERVAL_NSEC (UINT64_C(60000000000)) /* 60s */ +#define N_ACD_RFC_DEFEND_INTERVAL_NSEC (UINT64_C(10000000000)) /* 10s */ + +/** + * n_acd_probe_config_new() - create probe configuration + * @configp: output argument for new probe configuration + * + * This creates a new probe configuration. It will be returned in @configp to + * the caller, which upon return fully owns the object. + * + * A probe configuration collects parameters for probes. It never validates the + * input, but this is left to the consumer of the configuration to do. + * + * Return: 0 on success, negative error code on failure. + */ +_c_public_ int n_acd_probe_config_new(NAcdProbeConfig **configp) { + _c_cleanup_(n_acd_probe_config_freep) NAcdProbeConfig *config = NULL; + + config = malloc(sizeof(*config)); + if (!config) + return -ENOMEM; + + *config = (NAcdProbeConfig)N_ACD_PROBE_CONFIG_NULL(*config); + + *configp = config; + config = NULL; + return 0; +} + +/** + * n_acd_probe_config_free() - destroy probe configuration + * @config: configuration to operate on, or NULL + * + * This destroys the probe configuration and all associated objects. If @config + * is NULL, this is a no-op. + * + * Return: NULL is returned. 
+ */ +_c_public_ NAcdProbeConfig *n_acd_probe_config_free(NAcdProbeConfig *config) { + if (!config) + return NULL; + + free(config); + + return NULL; +} + +/** + * n_acd_probe_config_set_ip() - set ip property + * @config: configuration to operate on + * @ip: ip to set + * + * This sets the IP property to the value `ip`. The address is copied into the + * configuration object. No validation is performed. + * + * The IP property selects the IP address that a probe checks for. It is the + * caller's responsibility to guarantee the address is valid and can be used. + */ +_c_public_ void n_acd_probe_config_set_ip(NAcdProbeConfig *config, struct in_addr ip) { + config->ip = ip; +} + +/** + * n_acd_probe_config_set_timeout() - set timeout property + * @config: configuration to operate on + * @msecs: timeout to set, in milliseconds + * + * This sets the timeout to use for a conflict detection probe. The + * specification default is provided as `N_ACD_TIMEOUT_RFC5227` and corresponds + * to 9 seconds. + * + * If set to 0, conflict detection is skipped and the address is immediately + * advertised and defended. + * + * Depending on the transport used, the API user should select a suitable + * timeout. Since `ACD` only operates on the link layer, timeouts in the + * hundreds of milliseconds range should be more than enough for any modern + * network. Note that increasing this value directly affects the time it takes + * to connect to a network, since an address should not be used unless conflict + * detection finishes. + * + * Using the specification default is **discouraged**. It is way too slow and + * not appropriate for modern networks. + * + * Default value is `N_ACD_TIMEOUT_RFC5227`. 
+ */ +_c_public_ void n_acd_probe_config_set_timeout(NAcdProbeConfig *config, uint64_t msecs) { + config->timeout_msecs = msecs; +} + +/* + * Arm the timeout of @probe to fire @n_timeout nanoseconds from now, plus a + * pseudo-random jitter in the range [0, @n_jitter) nanoseconds. A jitter of 0 + * disables randomization. @n_jitter is 64 bits wide, because callers pass the + * timeout multiplier scaled by a nanosecond constant, and that product does + * not fit into 32 bits for large multipliers. + */ +static void n_acd_probe_schedule(NAcdProbe *probe, uint64_t n_timeout, uint64_t n_jitter) { + uint64_t n_time; + + timer_now(&probe->acd->timer, &n_time); + n_time += n_timeout; + + /* + * ACD specifies jitter values to reduce packet storms on the local + * link. This call accepts the maximum relative jitter value in + * nanoseconds as @n_jitter. We then use rand_r(3p) to get a + * pseudo-random jitter on top of the real timeout given as @n_timeout. + */ + if (n_jitter) { + uint64_t random; + + random = ((uint64_t)rand_r(&probe->acd->seed) << 32) | (uint64_t)rand_r(&probe->acd->seed); + n_time += random % n_jitter; + } + + timeout_schedule(&probe->timeout, &probe->acd->timer, n_time); +} + +static void n_acd_probe_unschedule(NAcdProbe *probe) { + timeout_unschedule(&probe->timeout); +} + +static bool n_acd_probe_is_unique(NAcdProbe *probe) { + NAcdProbe *sibling; + + if (!c_rbnode_is_linked(&probe->ip_node)) + return false; + + sibling = c_rbnode_entry(c_rbnode_next(&probe->ip_node), NAcdProbe, ip_node); + if (sibling && sibling->ip.s_addr == probe->ip.s_addr) + return false; + + sibling = c_rbnode_entry(c_rbnode_prev(&probe->ip_node), NAcdProbe, ip_node); + if (sibling && sibling->ip.s_addr == probe->ip.s_addr) + return false; + + return true; +} + +static int n_acd_probe_link(NAcdProbe *probe) { + int r; + + /* + * Make sure the kernel bpf map has space for at least one more + * entry. + */ + r = n_acd_ensure_bpf_map_space(probe->acd); + if (r) + return r; + + /* + * Link entry into context, indexed by its IP. Note that we allow + * duplicates just fine. It is up to you to decide whether to avoid + * duplicates, if you don't want them. Duplicates on the same context + * do not conflict with each other, though.
+ */ + { + CRBNode **slot, *parent; + NAcdProbe *other; + + slot = &probe->acd->ip_tree.root; + parent = NULL; + while (*slot) { + other = c_rbnode_entry(*slot, NAcdProbe, ip_node); + parent = *slot; + if (probe->ip.s_addr < other->ip.s_addr) + slot = &(*slot)->left; + else + slot = &(*slot)->right; + } + + c_rbtree_add(&probe->acd->ip_tree, parent, slot, &probe->ip_node); + } + + /* + * Add the ip address to the map, if it is not already there. + */ + if (n_acd_probe_is_unique(probe)) { + r = n_acd_bpf_map_add(probe->acd->fd_bpf_map, &probe->ip); + if (r) { + /* + * Make sure the IP address is linked in userspace iff + * it is linked in the kernel. + */ + c_rbnode_unlink(&probe->ip_node); + return r; + } + ++probe->acd->n_bpf_map; + } + + return 0; +} + +static void n_acd_probe_unlink(NAcdProbe *probe) { + int r; + + /* + * If this is the only probe for a given IP, remove the IP from the + * kernel BPF map. + */ + if (n_acd_probe_is_unique(probe)) { + r = n_acd_bpf_map_remove(probe->acd->fd_bpf_map, &probe->ip); + c_assert(r >= 0); + --probe->acd->n_bpf_map; + } + c_rbnode_unlink(&probe->ip_node); +} + +int n_acd_probe_new(NAcdProbe **probep, NAcd *acd, NAcdProbeConfig *config) { + _c_cleanup_(n_acd_probe_freep) NAcdProbe *probe = NULL; + int r; + + if (!config->ip.s_addr) + return N_ACD_E_INVALID_ARGUMENT; + + probe = malloc(sizeof(*probe)); + if (!probe) + return -ENOMEM; + + *probe = (NAcdProbe)N_ACD_PROBE_NULL(*probe); + probe->acd = n_acd_ref(acd); + probe->ip = config->ip; + + /* + * We use the provided timeout-length as multiplier for all our + * timeouts. The provided timeout defines the maximum length of an + * entire probe-interval until the first announcement. Given the + * spec-provided parameters, this ends up as: + * + * PROBE_WAIT + PROBE_MAX + PROBE_MAX + ANNOUNCE_WAIT + * = 1s + 3s + 3s + 2s + * = 9s + * + * Hence, the default value for this timeout is 9000ms, which just + * ends up matching the spec-provided values. 
+ * + * What we now semantically do is divide this timeout by 1ns/1000000. + * This first turns it into nanoseconds, then strips the unit by + * turning it into a multiplier. However, rather than performing the + * division here, we multiply all our timeouts by 1000000 statically + * at compile time. Therefore, we can use the user-provided timeout as + * unmodified multiplier. No conversion necessary. + */ + probe->timeout_multiplier = config->timeout_msecs; + + r = n_acd_probe_link(probe); + if (r) + return r; + + /* + * Now that everything is set up, we have to send the first probe. This + * is done after ~PROBE_WAIT seconds, hence we schedule our timer. + * In case no timeout-multiplier is set, we pretend we already sent all + * probes successfully and schedule the timer so we proceed with the + * announcements. We must schedule a fake timer there, since we are not + * allowed to advance the state machine outside of n_acd_dispatch(). + */ + if (probe->timeout_multiplier) { + probe->n_iteration = 0; + n_acd_probe_schedule(probe, + 0, + probe->timeout_multiplier * N_ACD_RFC_PROBE_WAIT_NSEC); + } else { + probe->n_iteration = N_ACD_RFC_PROBE_NUM; + n_acd_probe_schedule(probe, 0, 0); + } + + *probep = probe; + probe = NULL; + return 0; +} + +/** + * n_acd_probe_free() - destroy a probe + * @probe: probe to operate on, or NULL + * + * This destroys the probe specified by @probe. All operations are immediately + * ceased and all associated objects are released. + * + * If @probe is NULL, this is a no-op. + * + * This function will flush all events associated with @probe from the event + * queue. That is, no events will be returned for this @probe anymore. + * + * Return: NULL is returned.
+ */ +_c_public_ NAcdProbe *n_acd_probe_free(NAcdProbe *probe) { + NAcdEventNode *node, *t_node; + + if (!probe) + return NULL; + + c_list_for_each_entry_safe(node, t_node, &probe->event_list, probe_link) + n_acd_event_node_free(node); + + n_acd_probe_unschedule(probe); + n_acd_probe_unlink(probe); + probe->acd = n_acd_unref(probe->acd); + free(probe); + + return NULL; +} + +int n_acd_probe_raise(NAcdProbe *probe, NAcdEventNode **nodep, unsigned int event) { + _c_cleanup_(n_acd_event_node_freep) NAcdEventNode *node = NULL; + int r; + + r = n_acd_raise(probe->acd, &node, event); + if (r) + return r; + + switch (event) { + case N_ACD_EVENT_READY: + node->event.ready.probe = probe; + break; + case N_ACD_EVENT_USED: + node->event.used.probe = probe; + break; + case N_ACD_EVENT_DEFENDED: + node->event.defended.probe = probe; + break; + case N_ACD_EVENT_CONFLICT: + node->event.conflict.probe = probe; + break; + default: + c_assert(0); + return -ENOTRECOVERABLE; + } + + c_list_link_tail(&probe->event_list, &node->probe_link); + + if (nodep) + *nodep = node; + node = NULL; + return 0; +} + +int n_acd_probe_handle_timeout(NAcdProbe *probe) { + int r; + + switch (probe->state) { + case N_ACD_PROBE_STATE_PROBING: + /* + * We are still PROBING. We send 3 probes with a random timeout + * scheduled between each. If, after a fixed timeout, we did + * not receive any conflict we consider the probing successful. + */ + if (probe->n_iteration < N_ACD_RFC_PROBE_NUM) { + /* + * We have not sent all 3 probes, yet. A timer fired, + * so we are ready to send the next probe. If this is + * the third probe, schedule a timer for ANNOUNCE_WAIT + * to give other peers a chance to answer. If this is + * not the third probe, wait between PROBE_MIN and + * PROBE_MAX for the next probe. + */ + + r = n_acd_send(probe->acd, &probe->ip, NULL); + if (r) { + if (r != N_ACD_E_DROPPED) + return r; + + /* + * Packet was dropped, and we know about it. It + * never reached the network. 
Reasons are + * manifold, and n_acd_send() raises events if + * necessary. + * From a probe-perspective, we simply pretend + * we never sent the probe and schedule a + * timeout for the next probe, effectively + * doubling a single probe-interval. + */ + } else { + /* Successfully sent, so advance counter. */ + ++probe->n_iteration; + } + + if (probe->n_iteration < N_ACD_RFC_PROBE_NUM) + n_acd_probe_schedule(probe, + probe->timeout_multiplier * N_ACD_RFC_PROBE_MIN_NSEC, + probe->timeout_multiplier * (N_ACD_RFC_PROBE_MAX_NSEC - N_ACD_RFC_PROBE_MIN_NSEC)); + else + n_acd_probe_schedule(probe, + probe->timeout_multiplier * N_ACD_RFC_ANNOUNCE_WAIT_NSEC, + 0); + } else { + /* + * All 3 probes succeeded and we waited enough to + * consider this address usable by now. Do not announce + * the address, yet. We must first give the caller a + * chance to configure the address (so they can answer + * ARP requests), before announcing it. + */ + r = n_acd_probe_raise(probe, NULL, N_ACD_EVENT_READY); + if (r) + return r; + + probe->state = N_ACD_PROBE_STATE_CONFIGURING; + } + + break; + + case N_ACD_PROBE_STATE_ANNOUNCING: + /* + * We are ANNOUNCING, meaning the caller configured the address + * on the interface and is actively using it. We send 3 + * announcements out, in a short interval, and then just + * perform passive conflict detection. + * Note that once all 3 announcements are sent, we no longer + * schedule a timer, so this part should not trigger, anymore. + */ + + r = n_acd_send(probe->acd, &probe->ip, &probe->ip); + if (r) { + if (r != N_ACD_E_DROPPED) + return r; + + /* + * See above in STATE_PROBING for details. We know the + * packet was never sent, so we simply try again after + * extending the timer. + */ + } else { + /* Successfully sent, so advance counter. */ + ++probe->n_iteration; + } + + if (probe->n_iteration < N_ACD_RFC_ANNOUNCE_NUM) { + /* + * Announcements are always scheduled according to the + * time-intervals specified in the spec. 
We always use + * the RFC5227-mandated multiplier. + * If you reconsider this, note that timeout_multiplier + * might be 0 here. + */ + n_acd_probe_schedule(probe, + N_ACD_TIMEOUT_RFC5227 * N_ACD_RFC_ANNOUNCE_INTERVAL_NSEC, + 0); + } + + break; + + case N_ACD_PROBE_STATE_CONFIGURING: + case N_ACD_PROBE_STATE_FAILED: + default: + /* + * There are no timeouts in these states. If we trigger one, + * something is fishy. + */ + c_assert(0); + return -ENOTRECOVERABLE; + } + + return 0; +} + +int n_acd_probe_handle_packet(NAcdProbe *probe, struct ether_arp *packet, bool hard_conflict) { + NAcdEventNode *node; + uint64_t now; + int r; + + timer_now(&probe->acd->timer, &now); + + switch (probe->state) { + case N_ACD_PROBE_STATE_PROBING: + /* + * Regardless whether this is a hard or soft conflict, we must + * treat this as a probe failure. That is, notify the caller of + * the conflict and wait for further instructions. We do not + * react to this, until the caller tells us what to do, but we + * do stop sending further probes. + */ + r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_USED); + if (r) + return r; + + node->event.used.sender = node->sender; + node->event.used.n_sender = ETH_ALEN; + memcpy(node->sender, packet->arp_sha, ETH_ALEN); + + n_acd_probe_unschedule(probe); + n_acd_probe_unlink(probe); + probe->state = N_ACD_PROBE_STATE_FAILED; + + break; + + case N_ACD_PROBE_STATE_CONFIGURING: + /* + * We are waiting for the caller to configure the interface and + * start ANNOUNCING. In this state, we cannot defend the + * address as that would indicate that it is ready to be used, + * and we cannot signal CONFLICT or USED as the caller may + * already have started to use the address (and may have + * configured the engine to always defend it, which means they + * should be able to rely on never losing it after READY). + * Simply drop the event, and rely on the anticipated ANNOUNCE + * to trigger it again. 
+ */ + + break; + + case N_ACD_PROBE_STATE_ANNOUNCING: { + /* + * We were already instructed to announce the address, which + * means the address is configured and in use. Hence, the + * caller is responsible to serve regular ARP queries. Meaning, + * we can ignore any soft conflicts (other peers doing ACD). + * + * But if we see a hard-conflict, we either defend the address + * according to the caller's instructions, or we report the + * conflict and bail out. + */ + bool conflict = false, rate_limited = false; + + if (!hard_conflict) + break; + + rate_limited = now < probe->last_defend + N_ACD_RFC_DEFEND_INTERVAL_NSEC; + + switch (probe->defend) { + case N_ACD_DEFEND_NEVER: + conflict = true; + break; + case N_ACD_DEFEND_ONCE: + if (rate_limited) { + conflict = true; + break; + } + + /* fallthrough */ + case N_ACD_DEFEND_ALWAYS: + if (!rate_limited) { + r = n_acd_send(probe->acd, &probe->ip, &probe->ip); + if (r) { + if (r != N_ACD_E_DROPPED) + return r; + + if (probe->defend == N_ACD_DEFEND_ONCE) { + conflict = true; + break; + } + } + + if (r != N_ACD_E_DROPPED) + probe->last_defend = now; + } + + r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_DEFENDED); + if (r) + return r; + + node->event.defended.sender = node->sender; + node->event.defended.n_sender = ETH_ALEN; + memcpy(node->sender, packet->arp_sha, ETH_ALEN); + + break; + } + + if (conflict) { + r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_CONFLICT); + if (r) + return r; + + node->event.conflict.sender = node->sender; + node->event.conflict.n_sender = ETH_ALEN; + memcpy(node->sender, packet->arp_sha, ETH_ALEN); + + n_acd_probe_unschedule(probe); + n_acd_probe_unlink(probe); + probe->state = N_ACD_PROBE_STATE_FAILED; + } + + break; + } + + case N_ACD_PROBE_STATE_FAILED: + default: + /* + * We are not listening for packets in these states. If we receive one, + * something is fishy. 
+ */ + c_assert(0); + return -ENOTRECOVERABLE; + } + + return 0; +} + +/** + * n_acd_probe_set_userdata() - set userdata + * @probe: probe to operate on + * @userdata: userdata pointer + * + * This can be used to set a caller-controlled user-data pointer on @probe. The + * value of the pointer is never inspected or used by `n-acd` and is fully + * under control of the caller. + * + * The default value is NULL. + */ +_c_public_ void n_acd_probe_set_userdata(NAcdProbe *probe, void *userdata) { + probe->userdata = userdata; +} + +/** + * n_acd_probe_get_userdata() - get userdata + * @probe: probe to operate on + * @userdatap: output argument for the userdata pointer + * + * This queries the userdata pointer that was previously set through + * n_acd_probe_set_userdata(). The pointer is stored in @userdatap; the + * function itself returns nothing. + * + * The default value is NULL. + */ +_c_public_ void n_acd_probe_get_userdata(NAcdProbe *probe, void **userdatap) { + *userdatap = probe->userdata; +} + +/** + * n_acd_probe_announce() - announce the configured IP address + * @probe: probe to operate on + * @defend: defense policy + * + * Announce the IP address on the local link, and start defending it according + * to the given policy, which must be one of N_ACD_DEFEND_ONCE, + * N_ACD_DEFEND_NEVER, or N_ACD_DEFEND_ALWAYS. + * + * This must be called in response to an N_ACD_EVENT_READY event, and only + * after the given address has been configured on the given network interface. + * + * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the defense policy + * is invalid, negative error code on failure. + */ +_c_public_ int n_acd_probe_announce(NAcdProbe *probe, unsigned int defend) { + if (defend >= _N_ACD_DEFEND_N) + return N_ACD_E_INVALID_ARGUMENT; + + probe->state = N_ACD_PROBE_STATE_ANNOUNCING; + probe->defend = defend; + probe->n_iteration = 0; + + /* + * We must schedule a fake-timeout, since we are not allowed to + * advance the state-machine outside of n_acd_dispatch().
+ */ + n_acd_probe_schedule(probe, 0, 0); + + return 0; +} diff --git a/src/n-acd.c b/src/n-acd.c new file mode 100644 index 0000000000..c1d9286503 --- /dev/null +++ b/src/n-acd.c @@ -0,0 +1,1027 @@ +/* + * IPv4 Address Conflict Detection + * + * This file contains the main context initialization and management functions, + * as well as a bunch of utilities used through the n-acd modules. + */ + +/** + * DOC: IPv4 Address Conflict Detection + * + * The `n-acd` project implements the IPv4 Address Conflict Detection protocol + * as defined in RFC-5227. The protocol originates in the IPv4 Link Local + * Address selection but was later on generalized and resulted in `ACD`. The + * idea is to use `ARP` to query a link for an address to see whether it + * already exists on the network, as well as defending an address that is in + * use on a network interface. Furthermore, `ACD` provides passive diagnostics + * for administrators, as it will detect address conflicts automatically, which + * then can be logged or shown to a user. + * + * The main context object of `n-acd` is the `NAcd` structure. It is a passive + * ref-counted context object which drives `ACD` probes running on it. A + * context is specific to a linux network device and transport. If multiple + * network devices are used, then separate `NAcd` contexts must be deployed. + * + * The `NAcdProbe` object drives a single `ACD` state-machine. A probe is + * created on an `NAcd` context by providing an address to probe for. The probe + * will then raise notifications whether the address conflict detection found + * something, or whether the address is ready to be used. Optionally, the probe + * will then enter into passive mode and defend the address as long as it is + * kept active. + * + * Note that the `n-acd` project only implements the networking protocol. It + * never queries or modifies network interfaces. 
It completely relies on the + * API user to react to notifications and update network interfaces + * accordingly. `n-acd` uses an event-mechanism on every context object. All + * events raised by any probe or operation on a given context are queued on + * that context object. The event-queue can then be drained by the + * API user. All events are properly asynchronous and designed in a way that no + * synchronous reaction to any event is required. That is, the events are + * carefully designed to allow forwarding via IPC (or even networks) to a + * controller that handles them and specifies how to react. Furthermore, none + * of the function calls of `n-acd` require synchronous error handling. + * Instead, functions only ever return values on fatal errors. Everything else + * is queued as events, thus guaranteeing that synchronous handling of return + * values is not required. Exceptions are functions that do not affect internal + * state or do not have an associated context object. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "n-acd.h" +#include "n-acd-private.h" + +enum { + N_ACD_EPOLL_TIMER, + N_ACD_EPOLL_SOCKET, +}; + +static int n_acd_get_random(unsigned int *random) { + uint8_t hash_seed[] = { + 0x3a, 0x0c, 0xa6, 0xdd, 0x44, 0xef, 0x5f, 0x7a, + 0x5e, 0xd7, 0x25, 0x37, 0xbf, 0x4e, 0x80, 0xa1, + }; + CSipHash hash = C_SIPHASH_NULL; + struct timespec ts; + const uint8_t *p; + int r; + + /* + * We need random jitter for all timeouts when handling ARP probes. Use + * AT_RANDOM to get a seed for rand_r(3p), if available (should always + * be available on linux). See the time-out scheduler for details. + * Additionally, we include the current time in the seed. This avoids + * using the same jitter in case you run multiple ACD engines in the + * same process.
Lastly, the seed is hashed with SipHash24 to avoid + * exposing the value of AT_RANDOM on the network. + */ + c_siphash_init(&hash, hash_seed); + + p = (const uint8_t *)getauxval(AT_RANDOM); + if (p) + c_siphash_append(&hash, p, 16); + + r = clock_gettime(CLOCK_MONOTONIC, &ts); + if (r < 0) + return -c_errno(); + + c_siphash_append(&hash, (const uint8_t *)&ts.tv_sec, sizeof(ts.tv_sec)); + c_siphash_append(&hash, (const uint8_t *)&ts.tv_nsec, sizeof(ts.tv_nsec)); + + *random = c_siphash_finalize(&hash); + return 0; +} + +static int n_acd_socket_new(int *fdp, int fd_bpf_prog, NAcdConfig *config) { + const struct sockaddr_ll address = { + .sll_family = AF_PACKET, + .sll_protocol = htobe16(ETH_P_ARP), + .sll_ifindex = config->ifindex, + .sll_halen = ETH_ALEN, + .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + }; + int r, s = -1; + + s = socket(PF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); + if (s < 0) { + r = -c_errno(); + goto error; + } + + if (fd_bpf_prog >= 0) { + r = setsockopt(s, SOL_SOCKET, SO_ATTACH_BPF, &fd_bpf_prog, sizeof(fd_bpf_prog)); + if (r < 0) { + r = -c_errno(); + goto error; + } + } + + r = bind(s, (struct sockaddr *)&address, sizeof(address)); + if (r < 0) { + r = -c_errno(); + goto error; + } + + *fdp = s; + s = -1; + return 0; + +error: + if (s >= 0) + close(s); + return r; +} + +/** + * n_acd_config_new() - create configuration object + * @configp: output argument for new configuration + * + * This creates a new configuration object and provides it to the caller. The + * object is fully owned by the caller upon function return. + * + * A configuration object is a passive structure that is used to collect + * information that is then passed to a constructor or other function. A + * configuration never validates the data, but it is up to the consumer of a + * configuration to do that. + * + * Return: 0 on success, negative error code on failure. 
+ */ +_c_public_ int n_acd_config_new(NAcdConfig **configp) { + _c_cleanup_(n_acd_config_freep) NAcdConfig *config = NULL; + + config = malloc(sizeof(*config)); + if (!config) + return -ENOMEM; + + *config = (NAcdConfig)N_ACD_CONFIG_NULL(*config); + + *configp = config; + config = NULL; + return 0; +} + +/** + * n_acd_config_free() - destroy configuration object + * @config: configuration to operate on, or NULL + * + * This destroys the configuration object @config. If @config is NULL, this is + * a no-op. + * + * Return: NULL is returned. + */ +_c_public_ NAcdConfig *n_acd_config_free(NAcdConfig *config) { + if (!config) + return NULL; + + free(config); + + return NULL; +} + +/** + * n_acd_config_set_ifindex() - set ifindex property + * @config: configuration to operate on + * @ifindex: ifindex to set + * + * This sets the @ifindex property of the configuration object. Any previous + * value is overwritten. + * + * A valid ifindex is a 32bit integer greater than 0. Any other value is + * treated as unspecified. + * + * The ifindex corresponds to the interface index provided by the linux kernel. + * It specifies the network device to be used. + */ +_c_public_ void n_acd_config_set_ifindex(NAcdConfig *config, int ifindex) { + config->ifindex = ifindex; +} + +/** + * n_acd_config_set_transport() - set transport property + * @config: configuration to operate on + * @transport: transport to set + * + * This specifies the transport to use. A transport must be one of the + * `N_ACD_TRANSPORT_*` identifiers. It selects which transport protocol `n-acd` + * will run on. + */ +_c_public_ void n_acd_config_set_transport(NAcdConfig *config, unsigned int transport) { + config->transport = transport; +} + +/** + * n_acd_config_set_mac() - set mac property + * @config: configuration to operate on + * @mac: mac to set + * @n_mac: length of @mac, in bytes + * + * This specifies the hardware address (also referred to as `MAC Address`) to + * use. Any hardware address can be specified.
It is the caller's + * responsibility to make sure the address can actually be used. + * + * The address in @mac is copied into @config. It does not have to be retained + * by the caller. + */ +_c_public_ void n_acd_config_set_mac(NAcdConfig *config, const uint8_t *mac, size_t n_mac) { + /* + * We truncate the address at the maximum we support. We still remember + * the original length, so any consumer of this configuration can then + * complain about an unsupported address length. This allows us to + * avoid a memory allocation here and having to return `int`. + */ + config->n_mac = n_mac; + memcpy(config->mac, mac, n_mac > ETH_ALEN ? ETH_ALEN : n_mac); +} + +int n_acd_event_node_new(NAcdEventNode **nodep) { + NAcdEventNode *node; + + node = malloc(sizeof(*node)); + if (!node) + return -ENOMEM; + + *node = (NAcdEventNode)N_ACD_EVENT_NODE_NULL(*node); + + *nodep = node; + return 0; +} + +NAcdEventNode *n_acd_event_node_free(NAcdEventNode *node) { + if (!node) + return NULL; + + c_list_unlink(&node->probe_link); + c_list_unlink(&node->acd_link); + free(node); + + return NULL; +} + +int n_acd_ensure_bpf_map_space(NAcd *acd) { + NAcdProbe *probe; + _c_cleanup_(c_closep) int fd_map = -1, fd_prog = -1; + size_t max_map; + int r; + + if (acd->n_bpf_map < acd->max_bpf_map) + return 0; + + max_map = 2 * acd->max_bpf_map; + + r = n_acd_bpf_map_create(&fd_map, max_map); + if (r) + return r; + + c_rbtree_for_each_entry(probe, &acd->ip_tree, ip_node) { + r = n_acd_bpf_map_add(fd_map, &probe->ip); + if (r) + return r; + } + + r = n_acd_bpf_compile(&fd_prog, fd_map, (struct ether_addr*) acd->mac); + if (r) + return r; + + if (fd_prog >= 0) { + r = setsockopt(acd->fd_socket, SOL_SOCKET, SO_ATTACH_BPF, &fd_prog, sizeof(fd_prog)); + if (r) + return -c_errno(); + } + + if (acd->fd_bpf_map >= 0) + close(acd->fd_bpf_map); + acd->fd_bpf_map = fd_map; + fd_map = -1; + acd->max_bpf_map = max_map; + return 0; +} + +/** + * n_acd_new() - create a new ACD context + * @acdp: output argument 
for new context object + * @config: configuration parameters + * + * Create a new ACD context and return it in @acdp. The configuration @config + * must be initialized by the caller and must specify a valid network + * interface, transport mechanism, as well as hardware address compatible with + * the selected transport. The configuration is copied into the context. The + * @config object thus does not have to be retained by the caller. + * + * Return: 0 on success, negative error code on failure. + */ +_c_public_ int n_acd_new(NAcd **acdp, NAcdConfig *config) { + _c_cleanup_(n_acd_unrefp) NAcd *acd = NULL; + _c_cleanup_(c_closep) int fd_bpf_prog = -1; + struct epoll_event eevent; + int r; + + if (config->ifindex <= 0 || + config->transport != N_ACD_TRANSPORT_ETHERNET || + config->n_mac != ETH_ALEN || + !memcmp(config->mac, (uint8_t[ETH_ALEN]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, ETH_ALEN)) + return N_ACD_E_INVALID_ARGUMENT; + + acd = malloc(sizeof(*acd)); + if (!acd) + return -ENOMEM; + + *acd = (NAcd)N_ACD_NULL(*acd); + acd->ifindex = config->ifindex; + memcpy(acd->mac, config->mac, ETH_ALEN); + + r = n_acd_get_random(&acd->seed); + if (r) + return r; + + acd->fd_epoll = epoll_create1(EPOLL_CLOEXEC); + if (acd->fd_epoll < 0) + return -c_errno(); + + r = timer_init(&acd->timer); + if (r < 0) + return r; + + acd->max_bpf_map = 8; + + r = n_acd_bpf_map_create(&acd->fd_bpf_map, acd->max_bpf_map); + if (r) + return r; + + r = n_acd_bpf_compile(&fd_bpf_prog, acd->fd_bpf_map, (struct ether_addr*) acd->mac); + if (r) + return r; + + r = n_acd_socket_new(&acd->fd_socket, fd_bpf_prog, config); + if (r) + return r; + + eevent = (struct epoll_event){ + .events = EPOLLIN, + .data.u32 = N_ACD_EPOLL_TIMER, + }; + r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, acd->timer.fd, &eevent); + if (r < 0) + return -c_errno(); + + eevent = (struct epoll_event){ + .events = EPOLLIN, + .data.u32 = N_ACD_EPOLL_SOCKET, + }; + r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, acd->fd_socket, 
&eevent); + if (r < 0) + return -c_errno(); + + *acdp = acd; + acd = NULL; + return 0; +} + +static void n_acd_free_internal(NAcd *acd) { + NAcdEventNode *node, *t_node; + + if (!acd) + return; + + c_list_for_each_entry_safe(node, t_node, &acd->event_list, acd_link) + n_acd_event_node_free(node); + + c_assert(c_rbtree_is_empty(&acd->ip_tree)); + + if (acd->fd_socket >= 0) { + c_assert(acd->fd_epoll >= 0); + epoll_ctl(acd->fd_epoll, EPOLL_CTL_DEL, acd->fd_socket, NULL); + close(acd->fd_socket); + acd->fd_socket = -1; + } + + if (acd->fd_bpf_map >= 0) { + close(acd->fd_bpf_map); + acd->fd_bpf_map = -1; + } + + if (acd->timer.fd >= 0) { + c_assert(acd->fd_epoll >= 0); + epoll_ctl(acd->fd_epoll, EPOLL_CTL_DEL, acd->timer.fd, NULL); + timer_deinit(&acd->timer); + } + + if (acd->fd_epoll >= 0) { + close(acd->fd_epoll); + acd->fd_epoll = -1; + } + + free(acd); +} + +/** + * n_acd_ref() - acquire reference + * @acd: context to operate on, or NULL + * + * This acquires a single reference to the context specified as @acd. If @acd + * is NULL, this is a no-op. + * + * Return: @acd is returned. + */ +_c_public_ NAcd *n_acd_ref(NAcd *acd) { + if (acd) + ++acd->n_refs; + return acd; +} + +/** + * n_acd_unref() - release reference + * @acd: context to operate on, or NULL + * + * This releases a single reference to the context @acd. If this is the last + * reference, the context is torn down and deallocated. + * + * Return: NULL is returned. 
+ */ +_c_public_ NAcd *n_acd_unref(NAcd *acd) { + if (acd && !--acd->n_refs) + n_acd_free_internal(acd); + return NULL; +} + +int n_acd_raise(NAcd *acd, NAcdEventNode **nodep, unsigned int event) { + NAcdEventNode *node; + int r; + + r = n_acd_event_node_new(&node); + if (r) + return r; + + node->event.event = event; + c_list_link_tail(&acd->event_list, &node->acd_link); + + if (nodep) + *nodep = node; + return 0; +} + +int n_acd_send(NAcd *acd, const struct in_addr *tpa, const struct in_addr *spa) { + struct sockaddr_ll address = { + .sll_family = AF_PACKET, + .sll_protocol = htobe16(ETH_P_ARP), + .sll_ifindex = acd->ifindex, + .sll_halen = ETH_ALEN, + .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + }; + struct ether_arp arp = { + .ea_hdr = { + .ar_hrd = htobe16(ARPHRD_ETHER), + .ar_pro = htobe16(ETHERTYPE_IP), + .ar_hln = sizeof(acd->mac), + .ar_pln = sizeof(uint32_t), + .ar_op = htobe16(ARPOP_REQUEST), + }, + }; + ssize_t l; + int r; + + memcpy(arp.arp_sha, acd->mac, sizeof(acd->mac)); + memcpy(arp.arp_tpa, &tpa->s_addr, sizeof(uint32_t)); + + if (spa) + memcpy(arp.arp_spa, &spa->s_addr, sizeof(spa->s_addr)); + + l = sendto(acd->fd_socket, + &arp, + sizeof(arp), + MSG_NOSIGNAL, + (struct sockaddr *)&address, + sizeof(address)); + if (l < 0) { + if (errno == EAGAIN || errno == ENOBUFS) { + /* + * We never maintain outgoing queues. We rely on the + * network device to do that for us. In case the queues + * are full, or the kernel refuses to queue the packet + * for other reasons, we must tell our caller that the + * packet was dropped. + */ + return N_ACD_E_DROPPED; + } else if (errno == ENETDOWN || errno == ENXIO) { + /* + * These errors happen if the network device went down + * or was actually removed. We always propagate this as + * event, so the user can react accordingly (similarly + * to the recvmmsg(2) handler). 
In case the user does + * not immediately react, we also tell our caller that + * the packet was dropped, so we don't erroneously + * treat this as success. + */ + + r = n_acd_raise(acd, NULL, N_ACD_EVENT_DOWN); + if (r) + return r; + + return N_ACD_E_DROPPED; + } + + /* + * Random network error. We treat this as fatal and propagate + * the error, so it is noticed and can be investigated. + */ + return -c_errno(); + } else if (l != (ssize_t)sizeof(arp)) { + /* + * Ugh, the kernel modified the packet. This is unexpected. We + * consider the packet lost. + */ + return N_ACD_E_DROPPED; + } + + return 0; +} + +/** + * n_acd_get_fd() - get pollable file descriptor + * @acd: context object to operate on + * @fdp: output argument for file descriptor + * + * This returns the backing file-descriptor of the context object @acd. The + * file-descriptor is owned by @acd and valid as long as @acd is. The + * file-descriptor never changes, so it can be cached by the caller as long as + * they hold a reference to @acd. + * + * The file-descriptor is internal to the @acd context and should not be + * modified by the caller. It is only exposed to allow the caller to poll on + * it. Whenever the file-descriptor polls readable, n_acd_dispatch() should be + * called. + * + * Currently, the file-descriptor is an epoll-fd. + */ +_c_public_ void n_acd_get_fd(NAcd *acd, int *fdp) { + *fdp = acd->fd_epoll; +} + +static int n_acd_handle_timeout(NAcd *acd) { + NAcdProbe *probe; + uint64_t now; + int r; + + /* + * Read the current time once, and handle all timeouts that triggered + * before the current time. Rereading the current time in each loop + * might risk creating a live-lock, and the fact that we read the + * time after reading the timer guarantees that the timeout which + * woke us up is handled. + * + * When there are no more timeouts to handle at the given time, we + * rearm the timer to potentially wake us up again in the future. 
+ */ + timer_now(&acd->timer, &now); + + for (;;) { + Timeout *timeout; + + r = timer_pop_timeout(&acd->timer, now, &timeout); + if (r < 0) { + return r; + } else if (!timeout) { + /* + * There are no more timeouts pending before @now. Rearm + * the timer to fire again at the next timeout. + */ + timer_rearm(&acd->timer); + break; + } + + probe = (void *)timeout - offsetof(NAcdProbe, timeout); + r = n_acd_probe_handle_timeout(probe); + if (r) + return r; + } + + return 0; +} + +static int n_acd_handle_packet(NAcd *acd, struct ether_arp *packet) { + bool hard_conflict; + NAcdProbe *probe; + uint32_t addr; + CRBNode *node; + int r; + + /* + * We are interested in 2 kinds of ARP messages: + * + * 1) Someone who is *NOT* us sends *ANY* ARP message with our IP + * address as sender. This is never good, because it implies an + * address conflict. + * We call this a hard-conflict. + * + * 2) Someone who is *NOT* us sends an ARP REQUEST without any sender + * IP, but our IP as target. This implies someone else performs an + * ARP Probe with our address. This also implies a conflict, but + * one that can be resolved by responding to the probe. + * We call this a soft-conflict. + * + * We are never interested in any other ARP message. The kernel already + * deals with everything else, hence, we can silently ignore those. + * + * Now, we simply check whether a sender-address is set. This allows us + * to distinguish both cases. We then check further conditions, so we + * can bail out early if neither is the case. + * + * Lastly, we perform a lookup in our probe-set to check whether the + * address actually matches, so we can let these probes dispatch the + * message. Note that we allow duplicate probes, so we need to dispatch + * each matching probe, not just one. 
+ */ + + if (memcmp(packet->arp_spa, (uint8_t[4]){ }, sizeof(packet->arp_spa))) { + memcpy(&addr, packet->arp_spa, sizeof(addr)); + hard_conflict = true; + } else if (packet->ea_hdr.ar_op == htobe16(ARPOP_REQUEST)) { + memcpy(&addr, packet->arp_tpa, sizeof(addr)); + hard_conflict = false; + } else { + /* + * The BPF filter will not let through any other packet. + */ + return -EIO; + } + + /* Find top-most node that matches @addr. */ + node = acd->ip_tree.root; + while (node) { + probe = c_rbnode_entry(node, NAcdProbe, ip_node); + if (addr < probe->ip.s_addr) + node = node->left; + else if (addr > probe->ip.s_addr) + node = node->right; + else + break; + } + + /* + * If the address is unknown, we drop the packet. This might happen if + * the kernel queued the packet and passed the BPF filter, but we + * modified the set before dequeuing the message. + */ + if (!node) + return 0; + + /* Forward to left-most child that still matches @addr. */ + while (node->left && addr == c_rbnode_entry(node->left, + NAcdProbe, + ip_node)->ip.s_addr) + node = node->left; + + /* Iterate all matching entries in-order. */ + do { + probe = c_rbnode_entry(node, NAcdProbe, ip_node); + + r = n_acd_probe_handle_packet(probe, packet, hard_conflict); + if (r) + return r; + + node = c_rbnode_next(node); + } while (node && addr == c_rbnode_entry(node, + NAcdProbe, + ip_node)->ip.s_addr); + + return 0; +} + +static int n_acd_dispatch_timer(NAcd *acd, struct epoll_event *event) { + int r; + + if (event->events & (EPOLLHUP | EPOLLERR)) { + /* + * There is no way to handle either gracefully. If we ignored + * them, we would busy-loop, so let's rather forward the error + * to the caller. + */ + return -EIO; + } + + if (event->events & EPOLLIN) { + r = timer_read(&acd->timer); + if (r <= 0) + return r; + + c_assert(r == TIMER_E_TRIGGERED); + + /* + * A timer triggered, handle all pending timeouts at a given + * point in time.
There can only be a finite number of pending + * timeouts, any new ones will be in the future, so not handled + * now, but guaranteed to wake us up again when they do trigger. + */ + r = n_acd_handle_timeout(acd); + if (r) + return r; + } + + return 0; +} + +static bool n_acd_packet_is_valid(NAcd *acd, void *packet, size_t n_packet) { + struct ether_arp *arp; + + /* + * The eBPF filter will ensure that this function always returns true, however, + * this allows the eBPF filter to be an optional optimization which is necessary + * on older kernels. + * + * See comments in n-acd-bpf.c for details. + */ + + if (n_packet != sizeof(*arp)) + return false; + + arp = packet; + + if (arp->arp_hrd != htobe16(ARPHRD_ETHER)) + return false; + + if (arp->arp_pro != htobe16(ETHERTYPE_IP)) + return false; + + if (arp->arp_hln != sizeof(struct ether_addr)) + return false; + + if (arp->arp_pln != sizeof(struct in_addr)) + return false; + + if (!memcmp(arp->arp_sha, acd->mac, sizeof(struct ether_addr))) + return false; + + if (memcmp(arp->arp_spa, &((struct in_addr) { INADDR_ANY }), sizeof(struct in_addr))) { + if (arp->arp_op != htobe16(ARPOP_REQUEST) && arp->arp_op != htobe16(ARPOP_REPLY)) + return false; + } else if (arp->arp_op != htobe16(ARPOP_REQUEST)) { + return false; + } + + return true; +} + +static int n_acd_dispatch_socket(NAcd *acd, struct epoll_event *event) { + const size_t n_batch = 8; + struct mmsghdr msgs[n_batch]; + struct iovec iovecs[n_batch]; + struct ether_arp data[n_batch]; + size_t i; + int r, n; + + for (i = 0; i < n_batch; ++i) { + iovecs[i].iov_base = data + i; + iovecs[i].iov_len = sizeof(data[i]); + msgs[i].msg_hdr = (struct msghdr){ + .msg_iov = iovecs + i, + .msg_iovlen = 1, + }; + } + + /* + * We always directly call into recvmmsg(2), regardless which EPOLL* + * event is signalled. On sockets, the recv(2)-family of syscalls does + * a suitable job of handling all possible scenarios and telling us + * about it. 
Hence, let's take the easy route and always ask the kernel + * about the current state. + */ + n = recvmmsg(acd->fd_socket, msgs, n_batch, 0, NULL); + if (n < 0) { + if (errno == ENETDOWN) { + /* + * We get ENETDOWN if the network-device goes down or + * is removed. This error is temporary and only queued + * once. Subsequent reads will simply return EAGAIN + * until the device is up again and has data queued. + * Usually, the caller should tear down all probes when + * an interface goes down, but we leave it up to the + * caller to decide what to do. We propagate the code + * and continue. + */ + return n_acd_raise(acd, NULL, N_ACD_EVENT_DOWN); + } else if (errno == EAGAIN) { + /* + * There is no more data queued and we did not get + * preempted. Everything is good to go. + * As a safety-net against busy-looping, we do check + * for HUP/ERR. Neither should be set, since they imply + * error-dequeue behavior on all socket calls. Let's + * fail hard if we trigger it, so we can investigate. + */ + if (event->events & (EPOLLHUP | EPOLLERR)) + return -EIO; + + return 0; + } else { + /* + * Something went wrong. Propagate the error-code, so + * this can be investigated. + */ + return -c_errno(); + } + } else if (n >= (ssize_t)n_batch) { + /* + * If all buffers were filled with data, we cannot be sure that + * there is nothing left to read. But to avoid starvation, we + * cannot loop on this condition. Instead, we mark the context + * as preempted so the caller can call us again. + * Note that in level-triggered event-loops this condition can + * be neglected, but in edge-triggered event-loops it is + * crucial to forward this information. + * + * On the other hand, there are several conditions where the + * kernel might return fewer batches than requested, but was + * still preempted. However, all of those cases require the + * preemption to have triggered a wakeup *after* we entered + * recvmmsg().
Hence, even if we did not recognize the + * preemption, an edge must have triggered and as such we will + * handle the event on the next turn. + */ + acd->preempted = true; + } + + for (i = 0; (ssize_t)i < n; ++i) { + if (!n_acd_packet_is_valid(acd, data + i, msgs[i].msg_len)) + continue; + /* + * Handle the packet. Bail out if something went wrong. Note + * that this must be fatal errors, since we discard all other + * packets that follow. + */ + r = n_acd_handle_packet(acd, data + i); + if (r) + return r; + } + + return 0; +} + +/** + * n_acd_dispatch() - dispatch context + * @acd: context object to operate on + * + * This dispatches the internal state-machine of all probes and operations + * running on the context @acd. + * + * Any outside effect or event triggered by this dispatcher will be queued on + * the event-queue of @acd. Whenever the dispatcher returns, the caller is + * required to drain the event-queue via n_acd_pop_event() until it is empty. + * + * This function dispatches as many events as possible up to a static limit to + * prevent stalling execution. If the static limit is reached, this function + * will return with N_ACD_E_PREEMPTED, otherwise 0 is returned. In most cases + * preemption can be ignored, because level-triggered event notification + * handles it automatically. However, in case of edge-triggered event + * mechanisms, the caller must make sure to call the dispatcher again. + * + * Return: 0 on success, N_ACD_E_PREEMPTED on preemption, negative error code + * on failure. + */ +_c_public_ int n_acd_dispatch(NAcd *acd) { + struct epoll_event events[2]; + int n, i, r = 0; + + n = epoll_wait(acd->fd_epoll, events, sizeof(events) / sizeof(*events), 0); + if (n < 0) { + /* Linux never returns EINTR if `timeout == 0'. 
*/ + return -c_errno(); + } + + acd->preempted = false; + + for (i = 0; i < n; ++i) { + switch (events[i].data.u32) { + case N_ACD_EPOLL_TIMER: + r = n_acd_dispatch_timer(acd, events + i); + break; + case N_ACD_EPOLL_SOCKET: + r = n_acd_dispatch_socket(acd, events + i); + break; + default: + c_assert(0); + r = 0; + break; + } + + if (r) + return r; + } + + return acd->preempted ? N_ACD_E_PREEMPTED : 0; +} + +/** + * n_acd_pop_event() - get the next pending event + * @acd: context object to operate on + * @eventp: output argument for the event + * + * Returns a pointer to the next pending event. The event is still owned by + * the context, and is only valid until the next call to n_acd_pop_event() + * or until the owning object is freed (either the ACD context or the indicated + * probe object). + * + * An event either originates on the ACD context, or one of the configured + * probes. If the event-type has a 'probe' pointer, it originated on the + * indicated probe (which is *never* NULL), otherwise it originated on the + * context. + * + * Users must call this function repeatedly until either an error is returned, + * or the event-pointer is NULL. Wakeups on the epoll-fd are only guaranteed + * for each batch of events. Hence, it is the caller's responsibility to drain + * the event-queue somehow after each call to n_acd_dispatch(). Note that + * events can only be added by n_acd_dispatch(), hence, you cannot live-lock + * when draining the event queue. + * + * The possible events are: + * * N_ACD_EVENT_READY: A configured IP address was probed successfully + * and is ready to be used. Once configured on the + * interface, the caller must call n_acd_probe_announce() + * to announce and start defending the address. + * * N_ACD_EVENT_USED: Someone is already using the IP address being + * probed. The probe is put into stopped state and + * should be freed by the caller.
+ * * N_ACD_EVENT_DEFENDED: A conflict was detected for an announced IP + * address, and the engine attempted to defend it. + * This is purely informational, and no action is + * required by the caller. + * * N_ACD_EVENT_CONFLICT: A conflict was detected for an announced IP + * address, and the probe was not able to defend + * it (according to the configured policy). The + * probe halted, the caller must stop using + * the address immediately, and should free the probe. + * * N_ACD_EVENT_DOWN: The specified network interface was put down. The + * user is recommended to free *ALL* probes and + * recreate them as soon as the interface is up again. + * Note that this event is purely informational. The + * probes will continue running, but all packets will + * be blackholed, and no network packets are received, + * until the network is back up again. Hence, from an + * operational perspective, the legitimacy of the ACD + * probes is lost and the user better re-probes all + * addresses. + * + * Returns: 0 on success, negative error code on failure. The popped event is + * returned in @eventp. If no event is pending, NULL is placed in + * @eventp and 0 is returned. If an error is returned, @eventp is left + * untouched. + */ +_c_public_ int n_acd_pop_event(NAcd *acd, NAcdEvent **eventp) { + NAcdEventNode *node, *t_node; + + c_list_for_each_entry_safe(node, t_node, &acd->event_list, acd_link) { + if (node->is_public) { + n_acd_event_node_free(node); + continue; + } + + node->is_public = true; + *eventp = &node->event; + return 0; + } + + *eventp = NULL; + return 0; +} + +/** + * n_acd_probe() - start new probe + * @acd: context object to operate on + * @probep: output argument for new probe + * @config: probe configuration + * + * This creates a new probe on the context @acd and returns the probe in + * @probep. The configuration @config must provide valid probe parameters. At + * least a valid IP address must be provided through the configuration. 
+ * + * This function does not reject duplicate probes for the same address. It is + * the caller's decision whether duplicates are allowed or not. But note that + * duplicate probes on the same context will not conflict with each other. That is, + * running a probe for the same address twice on the same context will not + * cause them to consider each other a duplicate. + * + * Probes are rather lightweight objects. They do not create any + * file-descriptors or other kernel objects. Probes always re-use the + * infrastructure provided by the context object @acd. This allows running many + * probes simultaneously without exhausting resources. + * + * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT on invalid configuration + * parameters, negative error code on failure. + */ +_c_public_ int n_acd_probe(NAcd *acd, NAcdProbe **probep, NAcdProbeConfig *config) { + return n_acd_probe_new(probep, acd, config); +} diff --git a/src/n-acd.h b/src/n-acd.h new file mode 100644 index 0000000000..e2b01270fa --- /dev/null +++ b/src/n-acd.h @@ -0,0 +1,150 @@ +#pragma once + +/* + * IPv4 Address Conflict Detection + * + * This is the public header of the n-acd library, implementing IPv4 Address + * Conflict Detection as described in RFC-5227. This header defines the public + * API and all entry points of n-acd.
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +typedef struct NAcd NAcd; +typedef struct NAcdConfig NAcdConfig; +typedef struct NAcdEvent NAcdEvent; +typedef struct NAcdProbe NAcdProbe; +typedef struct NAcdProbeConfig NAcdProbeConfig; + +#define N_ACD_TIMEOUT_RFC5227 (UINT64_C(9000)) + +enum { + _N_ACD_E_SUCCESS, + + N_ACD_E_PREEMPTED, + N_ACD_E_INVALID_ARGUMENT, + + _N_ACD_E_N, +}; + +enum { + N_ACD_TRANSPORT_ETHERNET, + _N_ACD_TRANSPORT_N, +}; + +enum { + N_ACD_EVENT_READY, + N_ACD_EVENT_USED, + N_ACD_EVENT_DEFENDED, + N_ACD_EVENT_CONFLICT, + N_ACD_EVENT_DOWN, + _N_ACD_EVENT_N, +}; + +enum { + N_ACD_DEFEND_NEVER, + N_ACD_DEFEND_ONCE, + N_ACD_DEFEND_ALWAYS, + _N_ACD_DEFEND_N, +}; + +struct NAcdEvent { + unsigned int event; + union { + struct { + NAcdProbe *probe; + } ready; + struct { + } down; + struct { + NAcdProbe *probe; + uint8_t *sender; + size_t n_sender; + } used, defended, conflict; + }; +}; + +/* configs */ + +int n_acd_config_new(NAcdConfig **configp); +NAcdConfig *n_acd_config_free(NAcdConfig *config); + +void n_acd_config_set_ifindex(NAcdConfig *config, int ifindex); +void n_acd_config_set_transport(NAcdConfig *config, unsigned int transport); +void n_acd_config_set_mac(NAcdConfig *config, const uint8_t *mac, size_t n_mac); + +int n_acd_probe_config_new(NAcdProbeConfig **configp); +NAcdProbeConfig *n_acd_probe_config_free(NAcdProbeConfig *config); + +void n_acd_probe_config_set_ip(NAcdProbeConfig *config, struct in_addr ip); +void n_acd_probe_config_set_timeout(NAcdProbeConfig *config, uint64_t msecs); + +/* contexts */ + +int n_acd_new(NAcd **acdp, NAcdConfig *config); +NAcd *n_acd_ref(NAcd *acd); +NAcd *n_acd_unref(NAcd *acd); + +void n_acd_get_fd(NAcd *acd, int *fdp); +int n_acd_dispatch(NAcd *acd); +int n_acd_pop_event(NAcd *acd, NAcdEvent **eventp); + +int n_acd_probe(NAcd *acd, NAcdProbe **probep, NAcdProbeConfig *config); + +/* probes */ + +NAcdProbe *n_acd_probe_free(NAcdProbe *probe); + +void 
n_acd_probe_set_userdata(NAcdProbe *probe, void *userdata); +void n_acd_probe_get_userdata(NAcdProbe *probe, void **userdatap); + +int n_acd_probe_announce(NAcdProbe *probe, unsigned int defend); + +/* inline helpers */ + +static inline void n_acd_config_freep(NAcdConfig **config) { + if (*config) + n_acd_config_free(*config); +} + +static inline void n_acd_config_freev(NAcdConfig *config) { + n_acd_config_free(config); +} + +static inline void n_acd_probe_config_freep(NAcdProbeConfig **config) { + if (*config) + n_acd_probe_config_free(*config); +} + +static inline void n_acd_probe_config_freev(NAcdProbeConfig *config) { + n_acd_probe_config_free(config); +} + +static inline void n_acd_unrefp(NAcd **acd) { + if (*acd) + n_acd_unref(*acd); +} + +static inline void n_acd_unrefv(NAcd *acd) { + n_acd_unref(acd); +} + +static inline void n_acd_probe_freep(NAcdProbe **probe) { + if (*probe) + n_acd_probe_free(*probe); +} + +static inline void n_acd_probe_freev(NAcdProbe *probe) { + n_acd_probe_free(probe); +} + +#ifdef __cplusplus +} +#endif diff --git a/src/test-api.c b/src/test-api.c new file mode 100644 index 0000000000..70f7520836 --- /dev/null +++ b/src/test-api.c @@ -0,0 +1,88 @@ +/* + * Tests for n-acd API + * This verifies the visibility and availability of the public API. 
+ */ + +#undef NDEBUG +#include +#include +#include "n-acd.h" + +static void test_api_constants(void) { + assert(1 + N_ACD_TIMEOUT_RFC5227); + + assert(1 + _N_ACD_E_SUCCESS); + assert(1 + N_ACD_E_PREEMPTED); + assert(1 + N_ACD_E_INVALID_ARGUMENT); + assert(1 + _N_ACD_E_N); + + assert(1 + N_ACD_TRANSPORT_ETHERNET); + assert(1 + _N_ACD_TRANSPORT_N); + + assert(1 + N_ACD_EVENT_READY); + assert(1 + N_ACD_EVENT_USED); + assert(1 + N_ACD_EVENT_DEFENDED); + assert(1 + N_ACD_EVENT_CONFLICT); + assert(1 + N_ACD_EVENT_DOWN); + assert(1 + _N_ACD_EVENT_N); + + assert(1 + N_ACD_DEFEND_NEVER); + assert(1 + N_ACD_DEFEND_ONCE); + assert(1 + N_ACD_DEFEND_ALWAYS); + assert(1 + _N_ACD_DEFEND_N); +} + +static void test_api_types(void) { + assert(sizeof(NAcdEvent*)); + assert(sizeof(NAcdConfig*)); + assert(sizeof(NAcdProbeConfig*)); + assert(sizeof(NAcd*)); + assert(sizeof(NAcdProbe*)); +} + +static void test_api_functions(void) { + void *fns[] = { + (void *)n_acd_config_new, + (void *)n_acd_config_free, + (void *)n_acd_config_set_ifindex, + (void *)n_acd_config_set_transport, + (void *)n_acd_config_set_mac, + (void *)n_acd_probe_config_new, + (void *)n_acd_probe_config_free, + (void *)n_acd_probe_config_set_ip, + (void *)n_acd_probe_config_set_timeout, + + (void *)n_acd_new, + (void *)n_acd_ref, + (void *)n_acd_unref, + (void *)n_acd_get_fd, + (void *)n_acd_dispatch, + (void *)n_acd_pop_event, + (void *)n_acd_probe, + + (void *)n_acd_probe_free, + (void *)n_acd_probe_set_userdata, + (void *)n_acd_probe_get_userdata, + (void *)n_acd_probe_announce, + + (void *)n_acd_config_freep, + (void *)n_acd_config_freev, + (void *)n_acd_probe_config_freep, + (void *)n_acd_probe_config_freev, + (void *)n_acd_unrefp, + (void *)n_acd_unrefv, + (void *)n_acd_probe_freep, + (void *)n_acd_probe_freev, + }; + size_t i; + + for (i = 0; i < sizeof(fns) / sizeof(*fns); ++i) + assert(!!fns[i]); +} + +int main(int argc, char **argv) { + test_api_constants(); + test_api_types(); + test_api_functions(); + 
return 0; +} diff --git a/src/test-bpf.c b/src/test-bpf.c new file mode 100644 index 0000000000..78f9d0f19c --- /dev/null +++ b/src/test-bpf.c @@ -0,0 +1,226 @@ +/* + * eBPF socket filter tests + */ + +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "n-acd.h" +#include "n-acd-private.h" +#include "test.h" + +#define ETHER_ARP_PACKET_INIT(_op, _mac, _sip, _tip) { \ + .ea_hdr = { \ + .ar_hrd = htobe16(ARPHRD_ETHER), \ + .ar_pro = htobe16(ETHERTYPE_IP), \ + .ar_hln = 6, \ + .ar_pln = 4, \ + .ar_op = htobe16(_op), \ + }, \ + .arp_sha[0] = (_mac)->ether_addr_octet[0], \ + .arp_sha[1] = (_mac)->ether_addr_octet[1], \ + .arp_sha[2] = (_mac)->ether_addr_octet[2], \ + .arp_sha[3] = (_mac)->ether_addr_octet[3], \ + .arp_sha[4] = (_mac)->ether_addr_octet[4], \ + .arp_sha[5] = (_mac)->ether_addr_octet[5], \ + .arp_spa[0] = (be32toh((_sip)->s_addr) >> 24) & 0xff, \ + .arp_spa[1] = (be32toh((_sip)->s_addr) >> 16) & 0xff, \ + .arp_spa[2] = (be32toh((_sip)->s_addr) >> 8) & 0xff, \ + .arp_spa[3] = be32toh((_sip)->s_addr) & 0xff, \ + .arp_tpa[0] = (be32toh((_tip)->s_addr) >> 24) & 0xff, \ + .arp_tpa[1] = (be32toh((_tip)->s_addr) >> 16) & 0xff, \ + .arp_tpa[2] = (be32toh((_tip)->s_addr) >> 8) & 0xff, \ + .arp_tpa[3] = be32toh((_tip)->s_addr) & 0xff, \ + } + +static void test_map(void) { + int r, mapfd = -1; + struct in_addr addr = { 1 }; + + r = n_acd_bpf_map_create(&mapfd, 8); + c_assert(r >= 0); + c_assert(mapfd >= 0); + + r = n_acd_bpf_map_remove(mapfd, &addr); + c_assert(r == -ENOENT); + + r = n_acd_bpf_map_add(mapfd, &addr); + c_assert(r >= 0); + + r = n_acd_bpf_map_add(mapfd, &addr); + c_assert(r == -EEXIST); + + r = n_acd_bpf_map_remove(mapfd, &addr); + c_assert(r >= 0); + + r = n_acd_bpf_map_remove(mapfd, &addr); + c_assert(r == -ENOENT); + + close(mapfd); +} + +static void verify_success(struct ether_arp *packet, int out_fd, int in_fd) { + uint8_t buf[sizeof(struct ether_arp)]; + 
int r; + + r = send(out_fd, packet, sizeof(struct ether_arp), 0); + c_assert(r == sizeof(struct ether_arp)); + + r = recv(in_fd, buf, sizeof(buf), 0); + c_assert(r == sizeof(struct ether_arp)); +} + +static void verify_failure(struct ether_arp *packet, int out_fd, int in_fd) { + uint8_t buf[sizeof(struct ether_arp)]; + int r; + + r = send(out_fd, packet, sizeof(struct ether_arp), 0); + c_assert(r == sizeof(struct ether_arp)); + + r = recv(in_fd, buf, sizeof(buf), 0); + c_assert(r < 0); + c_assert(errno == EAGAIN); +} + +static void test_filter(void) { + uint8_t buf[sizeof(struct ether_arp) + 1] = {}; + struct ether_addr mac1 = { { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06 } }; + struct ether_addr mac2 = { { 0x01, 0x02, 0x03, 0x04, 0x05, 0x07 } }; + struct in_addr ip0 = { 0 }; + struct in_addr ip1 = { 1 }; + struct in_addr ip2 = { 2 }; + struct ether_arp *packet = (struct ether_arp *)buf; + int r, mapfd = -1, progfd = -1, pair[2]; + + r = n_acd_bpf_map_create(&mapfd, 1); + c_assert(r >= 0); + + r = n_acd_bpf_compile(&progfd, mapfd, &mac1); + c_assert(r >= 0); + c_assert(progfd >= 0); + + r = socketpair(AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, pair); + c_assert(r >= 0); + + r = setsockopt(pair[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, + sizeof(progfd)); + c_assert(r >= 0); + + r = n_acd_bpf_map_add(mapfd, &ip1); + c_assert(r >= 0); + + /* valid */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + verify_success(packet, pair[0], pair[1]); + + /* valid: reply instead of request */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REPLY, &mac2, &ip1, &ip2); + verify_success(packet, pair[0], pair[1]); + + /* valid: to us instead of from us */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip0, &ip1); + verify_success(packet, pair[0], pair[1]); + + /* invalid header type */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + packet->arp_hrd += 1; + 
verify_failure(packet, pair[0], pair[1]); + + /* invalid protocol */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + packet->arp_pro += 1; + verify_failure(packet, pair[0], pair[1]); + + /* invalid hw addr length */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + packet->arp_hln += 1; + verify_failure(packet, pair[0], pair[1]); + + /* invalid protocol addr length */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + packet->arp_pln += 1; + verify_failure(packet, pair[0], pair[1]); + + /* invalid operation */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_NAK, &mac2, &ip1, &ip2); + packet->arp_hln += 1; + verify_failure(packet, pair[0], pair[1]); + + /* own mac */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac1, &ip1, &ip2); + verify_failure(packet, pair[0], pair[1]); + + /* not to, nor from us, with source */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip2, &ip2); + verify_failure(packet, pair[0], pair[1]); + + /* not to, nor from us, without source */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip0, &ip2); + verify_failure(packet, pair[0], pair[1]); + + /* to us instead of from us, but reply */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REPLY, &mac2, &ip0, &ip1); + verify_failure(packet, pair[0], pair[1]); + + /* long */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + r = send(pair[0], buf, sizeof(struct ether_arp) + 1, 0); + c_assert(r == sizeof(struct ether_arp) + 1); + + r = recv(pair[1], buf, sizeof(buf), 0); + c_assert(r == sizeof(struct ether_arp)); + + /* short */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + r = send(pair[0], buf, sizeof(struct ether_arp) - 1, 0); + c_assert(r == sizeof(struct ether_arp) - 1); + + r = recv(pair[1], buf, 
sizeof(buf), 0); + c_assert(r < 0); + c_assert(errno == EAGAIN); + + /* + * Send one packet before and one packet after modifying the map, + * verify that the modification applies at the time of send(), not recv(). + */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + r = send(pair[0], buf, sizeof(struct ether_arp), 0); + c_assert(r == sizeof(struct ether_arp)); + + r = n_acd_bpf_map_remove(mapfd, &ip1); + c_assert(r >= 0); + + r = send(pair[0], buf, sizeof(struct ether_arp), 0); + c_assert(r == sizeof(struct ether_arp)); + + r = recv(pair[1], buf, sizeof(buf), 0); + c_assert(r == sizeof(struct ether_arp)); + + r = recv(pair[1], buf, sizeof(buf), 0); + c_assert(r < 0); + c_assert(errno == EAGAIN); + + close(pair[0]); + close(pair[1]); + close(progfd); + close(mapfd); +} + +int main(int argc, char **argv) { + test_setup(); + + test_map(); + test_filter(); + + return 0; +} diff --git a/src/test-loopback.c b/src/test-loopback.c new file mode 100644 index 0000000000..0671cf6691 --- /dev/null +++ b/src/test-loopback.c @@ -0,0 +1,82 @@ +/* + * Test on loopback device + * This runs the ACD engine on the loopback device, effectively testing the BPF + * filter of ACD to discard its own packets. This might happen on + * non-spanning-tree networks, or on networks that echo packets. 
+ */ + +#undef NDEBUG +#include +#include +#include "test.h" + +static void test_loopback(int ifindex, uint8_t *mac, size_t n_mac) { + NAcdConfig *config; + NAcd *acd; + struct pollfd pfds; + int r, fd; + + r = n_acd_config_new(&config); + c_assert(!r); + + n_acd_config_set_ifindex(config, ifindex); + n_acd_config_set_transport(config, N_ACD_TRANSPORT_ETHERNET); + n_acd_config_set_mac(config, mac, n_mac); + + r = n_acd_new(&acd, config); + c_assert(!r); + + n_acd_config_free(config); + + { + NAcdProbeConfig *probe_config; + NAcdProbe *probe; + struct in_addr ip = { htobe32((192 << 24) | (168 << 16) | (1 << 0)) }; + + r = n_acd_probe_config_new(&probe_config); + c_assert(!r); + + n_acd_probe_config_set_ip(probe_config, ip); + n_acd_probe_config_set_timeout(probe_config, 100); + + r = n_acd_probe(acd, &probe, probe_config); + c_assert(!r); + + n_acd_probe_config_free(probe_config); + + n_acd_get_fd(acd, &fd); + + for (;;) { + NAcdEvent *event; + pfds = (struct pollfd){ .fd = fd, .events = POLLIN }; + r = poll(&pfds, 1, -1); + c_assert(r >= 0); + + r = n_acd_dispatch(acd); + c_assert(!r); + + r = n_acd_pop_event(acd, &event); + c_assert(!r); + if (event) { + c_assert(event->event == N_ACD_EVENT_READY); + break; + } + } + + n_acd_probe_free(probe); + } + + n_acd_unref(acd); +} + +int main(int argc, char **argv) { + struct ether_addr mac; + int ifindex; + + test_setup(); + + test_loopback_up(&ifindex, &mac); + test_loopback(ifindex, mac.ether_addr_octet, sizeof(mac.ether_addr_octet)); + + return 0; +} diff --git a/src/test-twice.c b/src/test-twice.c new file mode 100644 index 0000000000..b474502ee3 --- /dev/null +++ b/src/test-twice.c @@ -0,0 +1,97 @@ +/* + * Test with unused address twice in parallel + * This runs the ACD engine with an unused address on a veth pair, but it runs + * it on both ends. We expect the PROBE to fail on at least one of the devices. 
+ */ + +#undef NDEBUG +#include +#include +#include "test.h" + +static void test_unused(int ifindex1, uint8_t *mac1, size_t n_mac1, int ifindex2, uint8_t *mac2, size_t n_mac2) { + NAcdConfig config1 = { + .ifindex = ifindex1, + .transport = N_ACD_TRANSPORT_ETHERNET, + .mac = mac1, + .n_mac = n_mac1, + .ip = { htobe32((192 << 24) | (168 << 16) | (1 << 0)) }, + .timeout_msec = 100, + }; + NAcdConfig config2 = { + .ifindex = ifindex2, + .transport = N_ACD_TRANSPORT_ETHERNET, + .mac = mac2, + .n_mac = n_mac2, + .ip = { htobe32((192 << 24) | (168 << 16) | (1 << 0)) }, + .timeout_msec = 100, + }; + struct pollfd pfds[2]; + NAcd *acd1, *acd2; + int r, fd1, fd2, state1, state2; + + r = n_acd_new(&acd1); + c_assert(!r); + r = n_acd_new(&acd2); + c_assert(!r); + + n_acd_get_fd(acd1, &fd1); + n_acd_get_fd(acd2, &fd2); + + r = n_acd_start(acd1, &config1); + c_assert(!r); + r = n_acd_start(acd2, &config2); + c_assert(!r); + + for (state1 = state2 = -1; state1 == -1 || state2 == -1; ) { + NAcdEvent *event; + pfds[0] = (struct pollfd){ .fd = fd1, .events = (state1 == -1) ? POLLIN : 0 }; + pfds[1] = (struct pollfd){ .fd = fd2, .events = (state2 == -1) ? 
POLLIN : 0 }; + + r = poll(pfds, sizeof(pfds) / sizeof(*pfds), -1); + c_assert(r >= 0); + + if (state1 == -1) { + r = n_acd_dispatch(acd1); + c_assert(!r); + + r = n_acd_pop_event(acd1, &event); + if (!r) { + c_assert(event->event == N_ACD_EVENT_READY || event->event == N_ACD_EVENT_USED); + state1 = !!(event->event == N_ACD_EVENT_READY); + } else { + c_assert(r == N_ACD_E_DONE); + } + } + + if (state2 == -1) { + r = n_acd_dispatch(acd2); + c_assert(!r); + + r = n_acd_pop_event(acd2, &event); + if (!r) { + c_assert(event->event == N_ACD_EVENT_READY || event->event == N_ACD_EVENT_USED); + state2 = !!(event->event == N_ACD_EVENT_READY); + } else { + c_assert(r == N_ACD_E_DONE); + } + } + } + + n_acd_free(acd1); + n_acd_free(acd2); + + c_assert(!state1 || !state2); +} + +int main(int argc, char **argv) { + struct ether_addr mac1, mac2; + int ifindex1, ifindex2; + + test_setup(); + + test_veth_new(&ifindex1, &mac1, &ifindex2, &mac2); + test_unused(ifindex1, mac1.ether_addr_octet, sizeof(mac1.ether_addr_octet), ifindex2, mac2.ether_addr_octet, sizeof(mac2.ether_addr_octet)); + + return 0; +} diff --git a/src/test-unplug.c b/src/test-unplug.c new file mode 100644 index 0000000000..9ad88a9189 --- /dev/null +++ b/src/test-unplug.c @@ -0,0 +1,84 @@ +/* + * Unplug device during test run + * Run the ACD engine with an address that is not used by anyone else on the + * link, but DOWN or UNPLUG the device while running.
+ */ + +#undef NDEBUG +#include +#include +#include "test.h" + +static void test_unplug_down(int ifindex, uint8_t *mac, size_t n_mac, unsigned int run) { + NAcdConfig config = { + .ifindex = ifindex, + .transport = N_ACD_TRANSPORT_ETHERNET, + .mac = mac, + .n_mac = n_mac, + .ip = { htobe32((192 << 24) | (168 << 16) | (1 << 0)) }, + .timeout_msec = 100, + }; + struct pollfd pfds; + NAcd *acd; + int r, fd; + + if (!run--) + test_veth_cmd(ifindex, "down"); + + r = n_acd_new(&acd); + c_assert(!r); + + if (!run--) + test_veth_cmd(ifindex, "down"); + + n_acd_get_fd(acd, &fd); + r = n_acd_start(acd, &config); + c_assert(!r); + + if (!run--) + test_veth_cmd(ifindex, "down"); + + for (;;) { + NAcdEvent *event; + pfds = (struct pollfd){ .fd = fd, .events = POLLIN }; + r = poll(&pfds, 1, -1); + c_assert(r >= 0); + + if (!run--) + test_veth_cmd(ifindex, "down"); + + r = n_acd_dispatch(acd); + c_assert(!r); + + r = n_acd_pop_event(acd, &event); + if (!r) { + if (event->event == N_ACD_EVENT_DOWN) { + break; + } else { + c_assert(event->event == N_ACD_EVENT_READY); + test_veth_cmd(ifindex, "down"); + } + } else { + c_assert(r == N_ACD_E_DONE); + } + } + + n_acd_free(acd); +} + +int main(int argc, char **argv) { + struct ether_addr mac; + unsigned int i; + int ifindex; + + test_setup(); + + test_veth_new(&ifindex, &mac, NULL, NULL); + + for (i = 0; i < 5; ++i) { + test_unplug_down(ifindex, mac.ether_addr_octet, sizeof(mac.ether_addr_octet), i); + test_veth_cmd(ifindex, "up"); + } + + return 0; +} diff --git a/src/test-unused.c b/src/test-unused.c new file mode 100644 index 0000000000..67ec2e4cee --- /dev/null +++ b/src/test-unused.c @@ -0,0 +1,63 @@ +/* + * Test with unused address + * Run the ACD engine with an address that is not used by anyone else on the + * link. This should just pass through, with a short, random timeout. 
+ */ + +#undef NDEBUG +#include +#include +#include "test.h" + +static void test_unused(int ifindex, const uint8_t *mac, size_t n_mac) { + NAcdConfig config = { + .ifindex = ifindex, + .transport = N_ACD_TRANSPORT_ETHERNET, + .mac = mac, + .n_mac = n_mac, + .ip = { htobe32((192 << 24) | (168 << 16) | (1 << 0)) }, + .timeout_msec = 100, + }; + struct pollfd pfds; + NAcd *acd; + int r, fd; + + r = n_acd_new(&acd); + c_assert(!r); + + n_acd_get_fd(acd, &fd); + r = n_acd_start(acd, &config); + c_assert(!r); + + for (;;) { + NAcdEvent *event; + pfds = (struct pollfd){ .fd = fd, .events = POLLIN }; + r = poll(&pfds, 1, -1); + c_assert(r >= 0); + + r = n_acd_dispatch(acd); + c_assert(!r); + + r = n_acd_pop_event(acd, &event); + if (!r) { + c_assert(event->event == N_ACD_EVENT_READY); + break; + } else { + c_assert(r == N_ACD_E_DONE); + } + } + + n_acd_free(acd); +} + +int main(int argc, char **argv) { + struct ether_addr mac; + int ifindex; + + test_setup(); + + test_veth_new(&ifindex, &mac, NULL, NULL); + test_unused(ifindex, mac.ether_addr_octet, sizeof(mac.ether_addr_octet)); + + return 0; +} diff --git a/src/test-veth.c b/src/test-veth.c new file mode 100644 index 0000000000..d19236838b --- /dev/null +++ b/src/test-veth.c @@ -0,0 +1,240 @@ +/* + * Test on a veth link + * + * This essentially mimics a real network with two peers. + * + * Run one ACD context on each end of the tunnel. On one end probe for N, + * addresses on the other end pre-configure N/3 of the same addresses and probe + * for another N/3 of the addresses. + * + * Verify that in the case of simultaneous probes of the same address at most one + * succeed, in the case of probing for a configured address it always fails, and + * probing for a non-existent address always succeeds. + * + * Make sure to keep N fairly high as the protocol is probabilistic, and we also + * want to verify that resizing the internal maps works correctly. 
+ */ + +#undef NDEBUG +#include +#include +#include "test.h" + +#define TEST_ACD_N_PROBES (9) + +typedef enum { + TEST_ACD_STATE_UNKNOWN, + TEST_ACD_STATE_USED, + TEST_ACD_STATE_READY, +} TestAcdState; + +static void test_veth(int ifindex1, uint8_t *mac1, size_t n_mac1, + int ifindex2, uint8_t *mac2, size_t n_mac2) { + NAcdConfig *config; + NAcd *acd1, *acd2; + NAcdProbe *probes1[TEST_ACD_N_PROBES]; + NAcdProbe *probes2[TEST_ACD_N_PROBES]; + unsigned long state1, state2; + size_t n_running = 0; + int r; + + r = n_acd_config_new(&config); + c_assert(!r); + + n_acd_config_set_transport(config, N_ACD_TRANSPORT_ETHERNET); + + n_acd_config_set_ifindex(config, ifindex1); + n_acd_config_set_mac(config, mac1, n_mac1); + r = n_acd_new(&acd1, config); + c_assert(!r); + + n_acd_config_set_ifindex(config, ifindex2); + n_acd_config_set_mac(config, mac2, n_mac2); + r = n_acd_new(&acd2, config); + c_assert(!r); + + n_acd_config_free(config); + + { + NAcdProbeConfig *probe_config; + + r = n_acd_probe_config_new(&probe_config); + c_assert(!r); + n_acd_probe_config_set_timeout(probe_config, 1024); + + c_assert(TEST_ACD_N_PROBES <= 10 << 24); + + for (size_t i = 0; i < TEST_ACD_N_PROBES; ++i) { + struct in_addr ip = { htobe32((10 << 24) | i) }; + + n_acd_probe_config_set_ip(probe_config, ip); + + switch (i % 3) { + case 0: + /* + * Probe on one side, and leave the address + * unset on the other. The probe must succeed. + */ + break; + case 1: + /* + * Preconfigure the address on one side, and + * probe on the other. The probe must fail. + */ + test_add_child_ip(&ip); + break; + case 2: + /* + * Probe both sides for the same address, at + * most one may succeed. 
+ */ + + r = n_acd_probe(acd2, &probes2[i], probe_config); + c_assert(!r); + + ++n_running; + break; + default: + c_assert(0); + abort(); + break; + } + + r = n_acd_probe(acd1, &probes1[i], probe_config); + c_assert(!r); + + ++n_running; + } + + n_acd_probe_config_free(probe_config); + + while (n_running > 0) { + NAcdEvent *event; + struct pollfd pfds[2] = { + { .events = POLLIN }, + { .events = POLLIN }, + }; + + n_acd_get_fd(acd1, &pfds[0].fd); + n_acd_get_fd(acd2, &pfds[1].fd); + + r = poll(pfds, 2, -1); + c_assert(r >= 0); + + if (pfds[0].revents & POLLIN) { + r = n_acd_dispatch(acd1); + c_assert(!r || r == N_ACD_E_PREEMPTED); + + for (;;) { + r = n_acd_pop_event(acd1, &event); + c_assert(!r); + if (event) { + switch (event->event) { + case N_ACD_EVENT_READY: + n_acd_probe_get_userdata(event->ready.probe, (void**)&state1); + c_assert(state1 == TEST_ACD_STATE_UNKNOWN); + state1 = TEST_ACD_STATE_READY; + n_acd_probe_set_userdata(event->ready.probe, (void*)state1); + + break; + case N_ACD_EVENT_USED: + n_acd_probe_get_userdata(event->used.probe, (void**)&state1); + c_assert(state1 == TEST_ACD_STATE_UNKNOWN); + state1 = TEST_ACD_STATE_USED; + n_acd_probe_set_userdata(event->used.probe, (void*)state1); + + break; + default: + c_assert(0); + } + + --n_running; + } else { + break; + } + } + } + + if (pfds[1].revents & POLLIN) { + r = n_acd_dispatch(acd2); + c_assert(!r || r == N_ACD_E_PREEMPTED); + + for (;;) { + r = n_acd_pop_event(acd2, &event); + c_assert(!r); + if (event) { + switch (event->event) { + case N_ACD_EVENT_READY: + n_acd_probe_get_userdata(event->ready.probe, (void**)&state2); + c_assert(state2 == TEST_ACD_STATE_UNKNOWN); + state2 = TEST_ACD_STATE_READY; + n_acd_probe_set_userdata(event->ready.probe, (void*)state2); + + break; + case N_ACD_EVENT_USED: + n_acd_probe_get_userdata(event->used.probe, (void**)&state2); + c_assert(state2 == TEST_ACD_STATE_UNKNOWN); + state2 = TEST_ACD_STATE_USED; + n_acd_probe_set_userdata(event->used.probe, (void*)state2); 
+ + break; + default: + c_assert(0); + } + + --n_running; + } else { + break; + } + } + } + } + + for (size_t i = 0; i < TEST_ACD_N_PROBES; ++i) { + struct in_addr ip = { htobe32((10 << 24) | i) }; + + switch (i % 3) { + case 0: + n_acd_probe_get_userdata(probes1[i], (void **)&state1); + c_assert(state1 == TEST_ACD_STATE_READY); + + break; + case 1: + test_del_child_ip(&ip); + + n_acd_probe_get_userdata(probes1[i], (void **)&state1); + c_assert(state1 == TEST_ACD_STATE_USED); + + break; + case 2: + n_acd_probe_get_userdata(probes1[i], (void **)&state1); + n_acd_probe_get_userdata(probes2[i], (void **)&state2); + c_assert(state1 != TEST_ACD_STATE_UNKNOWN); + c_assert(state2 != TEST_ACD_STATE_UNKNOWN); + c_assert(state1 == TEST_ACD_STATE_USED || state2 == TEST_ACD_STATE_USED); + n_acd_probe_free(probes2[i]); + + break; + } + n_acd_probe_free(probes1[i]); + } + } + + n_acd_unref(acd2); + n_acd_unref(acd1); +} + +int main(int argc, char **argv) { + struct ether_addr mac1, mac2; + int ifindex1, ifindex2; + + test_setup(); + + test_veth_new(&ifindex1, &mac1, &ifindex2, &mac2); + for (unsigned int i = 0; i < 8; ++i) { + test_veth(ifindex1, mac1.ether_addr_octet, sizeof(mac1.ether_addr_octet), + ifindex2, mac2.ether_addr_octet, sizeof(mac2.ether_addr_octet)); + } + + return 0; +} diff --git a/src/test.h b/src/test.h new file mode 100644 index 0000000000..69a786a013 --- /dev/null +++ b/src/test.h @@ -0,0 +1,213 @@ +#pragma once + +/* + * Test Helpers + * Bunch of helpers to setup the environment for networking tests. This + * includes net-namespace setups, veth setups, and more. 
+ */ + +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "n-acd.h" + +static inline void test_add_child_ip(const struct in_addr *addr) { + char *p; + int r; + + r = asprintf(&p, "ip addr add dev veth1 %s/8", inet_ntoa(*addr)); + c_assert(r >= 0); + + r = system(p); + c_assert(r >= 0); + + free(p); +} + +static inline void test_del_child_ip(const struct in_addr *addr) { + char *p; + int r; + + r = asprintf(&p, "ip addr del dev veth1 %s/8", inet_ntoa(*addr)); + c_assert(r >= 0); + + r = system(p); + c_assert(r >= 0); + + free(p); +} + +static inline void test_if_query(const char *name, int *indexp, struct ether_addr *macp) { + struct ifreq ifr = {}; + size_t l; + int r, s; + + l = strlen(name); + c_assert(l < IF_NAMESIZE); /* name plus its NUL must fit in ifr.ifr_name */ + + if (indexp) { + *indexp = if_nametoindex(name); + c_assert(*indexp > 0); + } + + if (macp) { + s = socket(AF_INET, SOCK_DGRAM, 0); + c_assert(s >= 0); + + strncpy(ifr.ifr_name, name, l + 1); + r = ioctl(s, SIOCGIFHWADDR, &ifr); + c_assert(r >= 0); + + memcpy(macp->ether_addr_octet, ifr.ifr_hwaddr.sa_data, ETH_ALEN); + + close(s); + } +} + +static inline void test_veth_cmd(int ifindex, const char *cmd) { + char *p, name[IF_NAMESIZE + 1] = {}; + int r; + + p = if_indextoname(ifindex, name); + c_assert(p); + + r = asprintf(&p, "ip link set %s %s", name, cmd); + c_assert(r >= 0); + + /* Again: Ewwww... */ + r = system(p); + c_assert(r == 0); + + free(p); +} + +static inline void test_veth_new(int *parent_indexp, + struct ether_addr *parent_macp, + int *child_indexp, + struct ether_addr *child_macp) { + int r; + + /* Eww... but it works.
*/ + r = system("ip link add type veth"); + c_assert(r == 0); + r = system("ip link set veth0 up"); + c_assert(r == 0); + r = system("ip link set veth1 up"); + c_assert(r == 0); + + test_if_query("veth0", parent_indexp, parent_macp); + test_if_query("veth1", child_indexp, child_macp); +} + +static inline void test_loopback_up(int *indexp, struct ether_addr *macp) { + int r; + + r = system("ip link set lo up"); + c_assert(r == 0); + + test_if_query("lo", indexp, macp); +} + +static inline void test_raise_memlock(void) { + const size_t wanted = 64 * 1024 * 1024; + struct rlimit get, set; + int r; + + r = getrlimit(RLIMIT_MEMLOCK, &get); + c_assert(!r); + + /* try raising limit to @wanted */ + set.rlim_cur = wanted; + set.rlim_max = (wanted > get.rlim_max) ? wanted : get.rlim_max; + r = setrlimit(RLIMIT_MEMLOCK, &set); + if (r) { + c_assert(errno == EPERM); + + /* not privileged to raise limit, so maximize soft limit */ + set.rlim_cur = get.rlim_max; + set.rlim_max = get.rlim_max; + r = setrlimit(RLIMIT_MEMLOCK, &set); + c_assert(!r); + } +} + +static inline void test_unshare_user_namespace(void) { + uid_t euid; + gid_t egid; + int r, fd; + + /* + * Enter a new user namespace as root:root. + */ + + euid = geteuid(); + egid = getegid(); + + r = unshare(CLONE_NEWUSER); + c_assert(r >= 0); + + fd = open("/proc/self/uid_map", O_WRONLY); + c_assert(fd >= 0); + r = dprintf(fd, "0 %d 1\n", euid); + c_assert(r >= 0); + close(fd); + + fd = open("/proc/self/setgroups", O_WRONLY); + c_assert(fd >= 0); + r = dprintf(fd, "deny"); + c_assert(r >= 0); + close(fd); + + fd = open("/proc/self/gid_map", O_WRONLY); + c_assert(fd >= 0); + r = dprintf(fd, "0 %d 1\n", egid); + c_assert(r >= 0); + close(fd); +} + +static inline void test_setup(void) { + int r; + + /* + * Move into a new network and mount namespace both associated + * with a new user namespace where the current eUID is mapped to + * 0. Then create a private instance of /run/netns. 
This ensures + * that any network devices or network namespaces are private to + * the test process. + */ + + test_raise_memlock(); + test_unshare_user_namespace(); + + r = unshare(CLONE_NEWNET | CLONE_NEWNS); + c_assert(r >= 0); + + r = mount(NULL, "/", "", MS_PRIVATE | MS_REC, NULL); + c_assert(r >= 0); + + r = mount(NULL, "/run", "tmpfs", 0, NULL); + c_assert(r >= 0); + + r = mkdir("/run/netns", 0755); + c_assert(r >= 0); +} diff --git a/src/util/test-timer.c b/src/util/test-timer.c new file mode 100644 index 0000000000..a0c908bd4a --- /dev/null +++ b/src/util/test-timer.c @@ -0,0 +1,177 @@ +/* + * Tests for timer utility library + */ + +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include "timer.h" + +#define N_TIMEOUTS (10000) + +static void test_api(void) { + Timer timer = TIMER_NULL(timer); + Timeout t1 = TIMEOUT_INIT(t1), t2 = TIMEOUT_INIT(t2), *t; + int r; + + r = timer_init(&timer); + c_assert(!r); + + timeout_schedule(&t1, &timer, 1); + timeout_schedule(&t2, &timer, 2); + + r = timer_pop_timeout(&timer, 10, &t); + c_assert(!r); + c_assert(t == &t1); + + timeout_unschedule(&t2); + + r = timer_pop_timeout(&timer, 10, &t); + c_assert(!r); + c_assert(!t); + + timer_deinit(&timer); +} + +static void test_pop(void) { + Timer timer = TIMER_NULL(timer); + Timeout timeouts[N_TIMEOUTS] = {}; + uint64_t times[N_TIMEOUTS] = {}; + size_t n_timeouts = 0; + bool armed; + Timeout *t; + int r; + + r = timer_init(&timer); + c_assert(!r); + + for(size_t i = 0; i < N_TIMEOUTS; ++i) { + timeouts[i] = (Timeout)TIMEOUT_INIT(timeouts[i]); + times[i] = rand() % 128 + 1; + timeout_schedule(&timeouts[i], &timer, times[i]); + } + + armed = true; + + for(size_t i = 0; i <= 128; ++i) { + if (armed) { + struct pollfd pfd = { + .fd = timer.fd, + .events = POLLIN, + }; + uint64_t count; + + r = poll(&pfd, 1, -1); + c_assert(r == 1); + + r = read(timer.fd, &count, sizeof(count)); + c_assert(r == sizeof(count)); + c_assert(count == 1); + armed = 
false; + } + + for (;;) { + uint64_t current_time; + + r = timer_pop_timeout(&timer, i, &t); + c_assert(!r); + if (!t) { + timer_rearm(&timer); + break; + } + + current_time = times[t - timeouts]; + c_assert(current_time == i); + ++n_timeouts; + armed = true; + } + } + + c_assert(n_timeouts == N_TIMEOUTS); + + r = timer_pop_timeout(&timer, (uint64_t)-1, &t); + c_assert(!r); + c_assert(!t); + + timer_deinit(&timer); +} + +void test_arm(void) { + struct itimerspec spec = { + .it_value = { + .tv_sec = 1000, + }, + }; + int fd1, fd2, r; + + fd1 = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); + c_assert(fd1 >= 0); + + fd2 = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); + c_assert(fd2 >= 0); + + r = timerfd_settime(fd1, 0, &spec, NULL); + c_assert(r >= 0); + + r = timerfd_settime(fd2, 0, &spec, NULL); + c_assert(r >= 0); + + r = timerfd_gettime(fd1, &spec); + c_assert(r >= 0); + c_assert(spec.it_value.tv_sec); + + r = timerfd_gettime(fd2, &spec); + c_assert(r >= 0); + c_assert(spec.it_value.tv_sec); + + spec = (struct itimerspec){}; + + r = timerfd_settime(fd1, 0, &spec, NULL); + c_assert(r >= 0); + + r = timerfd_gettime(fd1, &spec); + c_assert(r >= 0); + c_assert(!spec.it_value.tv_sec); + c_assert(!spec.it_value.tv_nsec); + + r = timerfd_gettime(fd2, &spec); + c_assert(r >= 0); + c_assert(spec.it_value.tv_sec); + + spec = (struct itimerspec){ .it_value = { .tv_nsec = 1, }, }; + + r = timerfd_settime(fd1, 0, &spec, NULL); + c_assert(r >= 0); + + r = poll(&(struct pollfd) { .fd = fd1, .events = POLLIN }, 1, -1); + c_assert(r == 1); + + r = timerfd_settime(fd2, 0, &spec, NULL); + c_assert(r >= 0); + + r = poll(&(struct pollfd) { .fd = fd2, .events = POLLIN }, 1, -1); + c_assert(r == 1); + + spec = (struct itimerspec){}; + + r = timerfd_settime(fd1, 0, &spec, NULL); + c_assert(r >= 0); + + r = poll(&(struct pollfd) { .fd = fd2, .events = POLLIN }, 1, -1); + c_assert(r == 1); + + close(fd2); + close(fd1); +} + +int main(int argc, char **argv) {
+ test_arm(); + test_api(); + test_pop(); + return 0; +} diff --git a/src/util/timer.c b/src/util/timer.c new file mode 100644 index 0000000000..af2a887cea --- /dev/null +++ b/src/util/timer.c @@ -0,0 +1,189 @@ +/* + * Timer Utility Library + */ + +#include +#include +#include +#include +#include +#include +#include +#include "timer.h" + +int timer_init(Timer *timer) { + clockid_t clock = CLOCK_BOOTTIME; + int r; + + r = timerfd_create(clock, TFD_CLOEXEC | TFD_NONBLOCK); + if (r < 0 && errno == EINVAL) { + clock = CLOCK_MONOTONIC; + r = timerfd_create(clock, TFD_CLOEXEC | TFD_NONBLOCK); + } + if (r < 0) + return -errno; + + *timer = (Timer)TIMER_NULL(*timer); + timer->fd = r; + timer->clock = clock; + + return 0; +} + +void timer_deinit(Timer *timer) { + c_assert(c_rbtree_is_empty(&timer->tree)); + + if (timer->fd >= 0) { + close(timer->fd); + timer->fd = -1; + } +} + +void timer_now(Timer *timer, uint64_t *nowp) { + struct timespec ts; + int r; + + r = clock_gettime(timer->clock, &ts); + c_assert(r >= 0); + + *nowp = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +} + +void timer_rearm(Timer *timer) { + uint64_t time; + Timeout *timeout; + int r; + + /* + * A timeout value of 0 clears the timer, we should only set that if + * no timeout exists in the tree. + */ + + timeout = c_rbnode_entry(c_rbtree_first(&timer->tree), Timeout, node); + c_assert(!timeout || timeout->timeout); + + time = timeout ? timeout->timeout : 0; + + if (time != timer->scheduled_timeout) { + r = timerfd_settime(timer->fd, + TFD_TIMER_ABSTIME, + &(struct itimerspec){ + .it_value = { + .tv_sec = time / UINT64_C(1000000000), + .tv_nsec = time % UINT64_C(1000000000), + }, + }, + NULL); + c_assert(r >= 0); + + timer->scheduled_timeout = time; + } +} + +int timer_read(Timer *timer) { + uint64_t v; + int r; + + r = read(timer->fd, &v, sizeof(v)); + if (r < 0) { + if (errno == EAGAIN) { + /* + * No more pending events. + */ + return 0; + } else { + /* + * Something failed. 
We use CLOCK_BOOTTIME/MONOTONIC, + * so ECANCELED cannot happen. Hence, there is no + * error that we could gracefully handle. Fail hard + * and let the caller deal with it. + */ + return -errno; + } + } else if (r != sizeof(v) || v == 0) { + /* + * Kernel guarantees 8-byte reads, and only to return + * data if at least one timer triggered; fail hard if + * it suddenly starts doing weird shit. + */ + return -EIO; + } + + return TIMER_E_TRIGGERED; +} + + +int timer_pop_timeout(Timer *timer, uint64_t until, Timeout **timeoutp) { + Timeout *timeout; + + /* + * If the first timeout is scheduled before @until, then unlink + * it and return it. Otherwise, return NULL. + */ + timeout = c_rbnode_entry(c_rbtree_first(&timer->tree), Timeout, node); + if (timeout && timeout->timeout <= until) { + c_rbnode_unlink(&timeout->node); + timeout->timeout = 0; + *timeoutp = timeout; + } else { + *timeoutp = NULL; + } + + return 0; +} + +void timeout_schedule(Timeout *timeout, Timer *timer, uint64_t time) { + c_assert(time); + + /* + * In case @timeout was already scheduled, remove it from the + * tree. If we are moving it to a new timer, rearm the old one. + */ + if (timeout->timer) { + c_rbnode_unlink(&timeout->node); + if (timeout->timer != timer) + timer_rearm(timeout->timer); + } + timeout->timer = timer; + timeout->timeout = time; + + /* + * Now insert it back into the tree in the correct new position. + * We allow duplicates in the tree, so this insertion is open-coded. + */ + { + Timeout *other; + CRBNode **slot, *parent; + + slot = &timer->tree.root; + parent = NULL; + while (*slot) { + other = c_rbnode_entry(*slot, Timeout, node); + parent = *slot; + if (timeout->timeout < other->timeout) + slot = &(*slot)->left; + else + slot = &(*slot)->right; + } + + c_rbtree_add(&timer->tree, parent, slot, &timeout->node); + } + + /* + * Rearm the timer as we updated the timeout tree. 
+ */ + timer_rearm(timer); +} + +void timeout_unschedule(Timeout *timeout) { + Timer *timer = timeout->timer; + + if (!timer) + return; + + c_rbnode_unlink(&timeout->node); + timeout->timeout = 0; + timeout->timer = NULL; + + timer_rearm(timer); +} diff --git a/src/util/timer.h b/src/util/timer.h new file mode 100644 index 0000000000..d01b27414b --- /dev/null +++ b/src/util/timer.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +typedef struct Timer Timer; +typedef struct Timeout Timeout; + +enum { + _TIMER_E_SUCCESS, + + TIMER_E_TRIGGERED, + + _TIMER_E_N, +}; + +struct Timer { + int fd; + clockid_t clock; + CRBTree tree; + uint64_t scheduled_timeout; +}; + +#define TIMER_NULL(_x) { \ + .fd = -1, \ + .tree = C_RBTREE_INIT, \ + } + +struct Timeout { + Timer *timer; + CRBNode node; + uint64_t timeout; +}; + +#define TIMEOUT_INIT(_x) { \ + .node = C_RBNODE_INIT((_x).node), \ + } + +int timer_init(Timer *timer); +void timer_deinit(Timer *timer); + +void timer_now(Timer *timer, uint64_t *nowp); + +int timer_pop_timeout(Timer *timer, uint64_t now, Timeout **timerp); +void timer_rearm(Timer *timer); +int timer_read(Timer *timer); + +void timeout_schedule(Timeout *timeout, Timer *timer, uint64_t time); +void timeout_unschedule(Timeout *timeout); + diff --git a/subprojects/c-list b/subprojects/c-list new file mode 160000 index 0000000000..6c53ef1c00 --- /dev/null +++ b/subprojects/c-list @@ -0,0 +1 @@ +Subproject commit 6c53ef1c0066a3b0d82e9e181e90114eacb7c4aa diff --git a/subprojects/c-rbtree b/subprojects/c-rbtree new file mode 160000 index 0000000000..c8cf175278 --- /dev/null +++ b/subprojects/c-rbtree @@ -0,0 +1 @@ +Subproject commit c8cf175278452686cc5993e154d472d0a64d7fac diff --git a/subprojects/c-siphash b/subprojects/c-siphash new file mode 160000 index 0000000000..2d159c7da1 --- /dev/null +++ b/subprojects/c-siphash @@ -0,0 +1 @@ +Subproject commit 2d159c7da1d542f2b1fcbbefea6931bce242b943 diff --git 
a/subprojects/c-stdaux b/subprojects/c-stdaux new file mode 160000 index 0000000000..8b8f941c57 --- /dev/null +++ b/subprojects/c-stdaux @@ -0,0 +1 @@ +Subproject commit 8b8f941c57a790c277f49b099e73ed9f8ea141af