diff --git a/.cherryci/ci-test b/.cherryci/ci-test new file mode 100755 index 0000000000..71f3457037 --- /dev/null +++ b/.cherryci/ci-test @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +rm -Rf "./ci-build" +mkdir "./ci-build" +cd "./ci-build" + +${CHERRY_LIB_MESONSETUP} . "${CHERRY_LIB_SRCDIR}" ${N_ACD_CONF} +${CHERRY_LIB_NINJABUILD} +sudo ${CHERRY_LIB_MESONTEST} +# no valgrind tests, since bpf(2) is not supported by it diff --git a/.cherryci/matrix b/.cherryci/matrix new file mode 100755 index 0000000000..0b5da37c74 --- /dev/null +++ b/.cherryci/matrix @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +CHERRY_MATRIX+=("export N_ACD_CONF=-Debpf=false ${CHERRY_LIB_M_DEFAULT[*]}") diff --git a/.editorconfig b/.editorconfig index b41176962d..b10bb4f3f8 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,16 +1,11 @@ -# http://EditorConfig.org - -# top-most EditorConfig file root = true -# Unix-style newlines with a newline ending every file, utf-8 charset [*] end_of_line = lf insert_final_newline = true trim_trailing_whitespace = true charset = utf-8 -# match config files, set indent to spaces with width of eight [*.{c,h}] indent_style = space indent_size = 8 diff --git a/.gitmodules b/.gitmodules index ec8b866d2f..d73d05a267 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "subprojects/c-siphash"] path = subprojects/c-siphash url = https://github.com/c-util/c-siphash.git +[submodule "subprojects/c-rbtree"] + path = subprojects/c-rbtree + url = https://github.com/c-util/c-rbtree.git diff --git a/.travis.yml b/.travis.yml index ed0bcf38c4..99a7bb9461 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,18 +1,21 @@ -dist: trusty -sudo: required os: linux +dist: trusty language: c -compiler: - - gcc - - clang -install: - - curl -L "https://github.com/ninja-build/ninja/releases/download/v1.7.2/ninja-linux.zip" -o "ninja-linux.zip" - - sudo unzip "ninja-linux.zip" -d "/usr/local/bin" - - sudo chmod 755 "/usr/local/bin/ninja" - - pip3 install meson +services: + - docker 
-script: - - meson "build" - - ninja -C "build" - - sudo MESON_TESTTHREADS=64 ninja -C "build" test +before_install: + - curl -O -L "https://raw.githubusercontent.com/cherry-pick/cherry-images/v1/scripts/vmrun" + - curl -O -L "https://raw.githubusercontent.com/cherry-pick/cherry-ci/v1/scripts/cherryci" + - chmod +x "./vmrun" "./cherryci" + +jobs: + include: + - stage: test + script: + - ./vmrun -- ../src/cherryci -d ../src/.cherryci -s c-util -m + - script: + - ./vmrun -T armv7hl -- ../src/cherryci -d ../src/.cherryci -s c-util + - script: + - ./vmrun -T i686 -- ../src/cherryci -d ../src/.cherryci -s c-util diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000000..89ee27d233 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,39 @@ +LICENSE: + This project is dual-licensed under both the Apache License, Version + 2.0, and the GNU Lesser General Public License, Version 2.1+. + +AUTHORS-ASL: + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +AUTHORS-LGPL: + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program; If not, see <http://www.gnu.org/licenses/>. + +COPYRIGHT: (ordered alphabetically) + Copyright (C) 2015-2018 Red Hat, Inc. + +AUTHORS: (ordered alphabetically) + Beniamino Galvani + David Herrmann + Thomas Haller + Tom Gundersen diff --git a/AUTHORS-ASL b/AUTHORS-ASL new file mode 100644 index 0000000000..5d501a7284 --- /dev/null +++ b/AUTHORS-ASL @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright {yyyy} {name of copyright owner} + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/AUTHORS-LGPL b/AUTHORS-LGPL new file mode 100644 index 0000000000..4362b49151 --- /dev/null +++ b/AUTHORS-LGPL @@ -0,0 +1,502 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. 
Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. 
Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. 
+ + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. 
(Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. 
+ + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. 
+ +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". 
Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. 
+ + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. 
+ + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. 
You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. 
For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. 
+Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details.
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! diff --git a/COPYING b/COPYING deleted file mode 100644 index 81c0566b88..0000000000 --- a/COPYING +++ /dev/null @@ -1,19 +0,0 @@ -LICENSE: - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -COPYRIGHT: (ordered alphabetically) - Copyright (C) 2015-2017 Red Hat, Inc. - -AUTHORS: (ordered alphabetically) - David Herrmann - Tom Gundersen diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 5d501a7284..0000000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document.
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the 
following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - -6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - -8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - -Copyright {yyyy} {name of copyright owner} - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/LICENSE b/LICENSE new file mode 120000 index 0000000000..da24c5e4a6 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +AUTHORS-ASL \ No newline at end of file diff --git a/NEWS b/NEWS new file mode 100644 index 0000000000..bb06abbc5a --- /dev/null +++ b/NEWS @@ -0,0 +1,21 @@ +n-acd - IPv4 Address Conflict Detection + +CHANGES WITH 1: + + * Initial release of n-acd. This project implements the IPv4 Address + Conflict Detection standard as defined in RFC-5227. The state machine + is implemented in a shared library and provides a stable ISO-C11 API. + The implementation is linux-only and relies heavily on the API + behavior of recent linux kernel releases. + + * Compared to the pre-releases, this release supports many parallel + probes on a single n-acd context. 
This reduces the number of + allocated network resources to O(1), based on the number of running + parallel probes. + + * The n-acd project is now dual-licensed: ASL-2.0 and LGPL-2.1+ + + Contributions from: Beniamino Galvani, David Herrmann, Thomas Haller, + Tom Gundersen + + - Tübingen, 2018-08-08 diff --git a/README b/README index 4077cba05e..b88d31c713 100644 --- a/README +++ b/README @@ -8,7 +8,7 @@ ABOUT: kernel releases. DETAILS: - https://github.com/nettools/n-acd/wiki + https://nettools.github.io/n-acd BUG REPORTS: https://github.com/nettools/n-acd/issues @@ -20,14 +20,18 @@ GIT: GITWEB: https://github.com/nettools/n-acd +MAILINGLIST: + https://groups.google.com/forum/#!forum/nettools-devel + LICENSE: - Apache Software License 2.0 (LICENSE) - See COPYING for details. + Apache Software License 2.0 + Lesser General Public License 2.1+ + See AUTHORS for details. REQUIREMENTS: The requirements for n-acd are: - Linux kernel >= 3.0 + Linux kernel >= 3.19 libc (e.g., glibc >= 2.16) At build-time, the following software is required: @@ -36,15 +40,15 @@ REQUIREMENTS: pkg-config >= 0.29 INSTALL: - The meson build-system is used for n-acd. Contact upstream + The meson build-system is used for this project. Contact upstream documentation for detailed help. In most situations the following - commands are sufficient to build and install n-acd from source: + commands are sufficient to build and install from source: $ mkdir build $ cd build - $ meson setup . .. + $ meson setup .. $ ninja - $ ninja test + $ meson test # ninja install - No custom configuration options are available. + For custom configuration options see meson_options.txt. 
diff --git a/meson.build b/meson.build index da923c288d..a05164c048 100644 --- a/meson.build +++ b/meson.build @@ -1,19 +1,25 @@ -project('n-acd', +project( + 'n-acd', 'c', version: '1', license: 'Apache', default_options: [ - 'buildtype=release', 'c_std=c11', - ]) + ], +) +project_description = 'IPv4 Address Conflict Detection' add_project_arguments('-D_GNU_SOURCE', language: 'c') mod_pkgconfig = import('pkgconfig') sub_clist = subproject('c-list') +sub_crbtree = subproject('c-rbtree') sub_csiphash = subproject('c-siphash') dep_clist = sub_clist.get_variable('libclist_dep') +dep_crbtree = sub_crbtree.get_variable('libcrbtree_dep') dep_csiphash = sub_csiphash.get_variable('libcsiphash_dep') +use_ebpf = get_option('ebpf') + subdir('src') diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000000..b024ee1d4c --- /dev/null +++ b/meson_options.txt @@ -0,0 +1 @@ +option('ebpf', type: 'boolean', value: true, description: 'Enable eBPF packet filtering') diff --git a/src/libnacd.sym b/src/libnacd.sym index c9bd487533..f85e13acf9 100644 --- a/src/libnacd.sym +++ b/src/libnacd.sym @@ -1,13 +1,28 @@ -LIBNACD_1 { +LIBNACD_2 { global: + n_acd_config_new; + n_acd_config_free; + n_acd_config_set_ifindex; + n_acd_config_set_transport; + n_acd_config_set_mac; + + n_acd_probe_config_new; + n_acd_probe_config_free; + n_acd_probe_config_set_ip; + n_acd_probe_config_set_timeout; + n_acd_new; - n_acd_free; + n_acd_ref; + n_acd_unref; n_acd_get_fd; n_acd_dispatch; n_acd_pop_event; - n_acd_start; - n_acd_stop; - n_acd_announce; + n_acd_probe; + + n_acd_probe_free; + n_acd_probe_set_userdata; + n_acd_probe_get_userdata; + n_acd_probe_announce; local: *; }; diff --git a/src/meson.build b/src/meson.build index ba09d1323b..0a405f9c4d 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,76 +1,94 @@ # # target: libnacd.so -# We build both, a static and a shared library. We want our tests to get access -# to internals, so we link them statically. 
# -libnacd_private = static_library('nacd-private', - ['n-acd.c'], - c_args: [ - '-fvisibility=hidden', - '-fno-common' - ], - dependencies: [ - dep_clist, - dep_csiphash, - ], - pic: true) -install_headers('n-acd.h') libnacd_symfile = join_paths(meson.current_source_dir(), 'libnacd.sym') -libnacd_shared = shared_library('nacd', - dependencies: dep_csiphash, - objects: libnacd_private.extract_all_objects(), - install: true, - soversion: 0, - link_depends: libnacd_symfile, - link_args: [ - '-Wl,--no-undefined', - '-Wl,--version-script=@0@'.format(libnacd_symfile) - ]) -mod_pkgconfig.generate(libraries: libnacd_shared, - version: meson.project_version(), - name: 'libnacd', - filebase: 'libnacd', - description: 'IPv4 Address Conflict Detection') -# -# target: test-api -# The test-api program explicitly links against the shared library, since it -# tests for symbol visibility. -# +libnacd_deps = [ + dep_clist, + dep_crbtree, + dep_csiphash, +] -test_api = executable('test-api', - ['test-api.c'], - link_with: libnacd_shared) -test('API Symbol Visibility', test_api) +libnacd_sources = [ + 'n-acd.c', + 'n-acd-probe.c', + 'util/timer.c', +] + +if use_ebpf + libnacd_sources += [ + 'n-acd-bpf.c', + ] +else + libnacd_sources += [ + 'n-acd-bpf-fallback.c', + ] +endif + +libnacd_private = static_library( + 'nacd-private', + libnacd_sources, + c_args: [ + '-fvisibility=hidden', + '-fno-common' + ], + dependencies: libnacd_deps, + pic: true, +) + +libnacd_shared = shared_library( + 'nacd', + objects: libnacd_private.extract_all_objects(), + dependencies: libnacd_deps, + install: not meson.is_subproject(), + soversion: 0, + link_depends: libnacd_symfile, + link_args: [ + '-Wl,--no-undefined', + '-Wl,--version-script=@0@'.format(libnacd_symfile) + ], +) + +libnacd_dep = declare_dependency( + include_directories: include_directories('.'), + link_with: libnacd_private, + dependencies: libnacd_deps, + version: meson.project_version(), +) + +if not meson.is_subproject() + 
install_headers('n-acd.h') + + mod_pkgconfig.generate( + libraries: libnacd_shared, + version: meson.project_version(), + name: 'libnacd', + filebase: 'libnacd', + description: project_description, + ) +endif # # target: test-* -# All other tests are listed here. They link against the static library, so -# they can access internals for verification. # -test_basic = executable('test-basic', - ['test-basic.c'], - link_with: libnacd_private) -test('Basic API Behavior', test_basic) +test_api = executable('test-api', ['test-api.c'], link_with: libnacd_shared) +test('API Symbol Visibility', test_api) -test_loopback = executable('test-loopback', - ['test-loopback.c'], - link_with: libnacd_private) +if use_ebpf + test_bpf = executable('test-bpf', ['test-bpf.c'], dependencies: libnacd_dep) + test('eBPF socket filtering', test_bpf) +endif + +test_loopback = executable('test-loopback', ['test-loopback.c'], dependencies: libnacd_dep) test('Echo Suppression via Loopback', test_loopback) -test_twice = executable('test-twice', - ['test-twice.c'], - link_with: libnacd_private) -test('Two ACD in Parallel', test_twice) +test_timer = executable('test-timer', ['util/test-timer.c'], dependencies: libnacd_dep) +test('Timer helper', test_timer) -test_unplug = executable('test-unplug', - ['test-unplug.c'], - link_with: libnacd_private) -test('Async Interface Hotplug', test_unplug) +#test_unplug = executable('test-unplug', ['test-unplug.c'], dependencies: libnacd_dep) +#test('Async Interface Hotplug', test_unplug) -test_unused = executable('test-unsed', - ['test-unused.c'], - link_with: libnacd_private) -test('Unconflicted ACD', test_unused) +test_veth = executable('test-veth', ['test-veth.c'], dependencies: libnacd_dep) +test('Parallel ACD instances', test_veth) diff --git a/src/n-acd-bpf-fallback.c b/src/n-acd-bpf-fallback.c new file mode 100644 index 0000000000..5e6bdd0677 --- /dev/null +++ b/src/n-acd-bpf-fallback.c @@ -0,0 +1,29 @@ +/* + * A noop implementation of eBPF filter for IPv4 
Address Conflict Detection + * + * These are a collection of dummy functions that have no effect, but allow + * n-acd to compile without eBPF support. + * + * See n-acd-bpf.c for documentation. + */ + +#include +#include "n-acd-private.h" + +int n_acd_bpf_map_create(int *mapfdp, size_t max_entries) { + *mapfdp = -1; + return 0; +} + +int n_acd_bpf_map_add(int mapfd, struct in_addr *addrp) { + return 0; +} + +int n_acd_bpf_map_remove(int mapfd, struct in_addr *addrp) { + return 0; +} + +int n_acd_bpf_compile(int *progfdp, int mapfd, struct ether_addr *macp) { + *progfdp = -1; + return 0; +} diff --git a/src/n-acd-bpf.c b/src/n-acd-bpf.c new file mode 100644 index 0000000000..771a28eeb2 --- /dev/null +++ b/src/n-acd-bpf.c @@ -0,0 +1,316 @@ +/* + * eBPF filter for IPv4 Address Conflict Detection + * + * An eBPF map and an eBPF program are provided. The map contains all the + * addresses address conflict detection is performed on, and the program + * filters out all packets except exactly the packets relevant to the ACD + * protocol on the addresses currently in the map. + * + * Note that userspace still has to filter the incoming packets, as filters + * are applied when packets are queued on the socket, not when userspace + * receives them. It is therefore possible to receive packets about addresses + * that have already been removed.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "n-acd-private.h" + +#define BPF_LD_ABS(SIZE, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM, \ + }) + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0, \ + }) + +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = BPF_PSEUDO_MAP_FD, \ + .off = 0, \ + .imm = (__u32) (MAP_FD), \ + }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (MAP_FD)) >> 32, \ + }) + +#define BPF_ALU_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0, \ + }) + +#define BPF_ALU_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM, \ + }) + +#define BPF_MOV_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0, \ + }) + +#define BPF_MOV_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM, \ + }) + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0, \ + }) + +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0, \ + }) + +#define BPF_JMP_IMM(OP, DST, 
IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM, \ + }) + +#define BPF_EMIT_CALL(FUNC) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = FUNC, \ + }) + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0, \ + }) + +static int n_acd_syscall_bpf(int cmd, union bpf_attr *attr, unsigned int size) { + return (int)syscall(__NR_bpf, cmd, attr, size); +} + +int n_acd_bpf_map_create(int *mapfdp, size_t max_entries) { + union bpf_attr attr; + int mapfd; + + memset(&attr, 0, sizeof(attr)); + attr = (union bpf_attr){ + .map_type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(uint32_t), + .value_size = sizeof(uint8_t), /* values are never used, but must be set */ + .max_entries = max_entries, + }; + + mapfd = n_acd_syscall_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); + if (mapfd < 0) + return -errno; + + *mapfdp = mapfd; + return 0; +} + +int n_acd_bpf_map_add(int mapfd, struct in_addr *addrp) { + union bpf_attr attr; + uint32_t addr = be32toh(addrp->s_addr); + uint8_t _dummy = 0; + int r; + + memset(&attr, 0, sizeof(attr)); + attr = (union bpf_attr){ + .map_fd = mapfd, + .key = (uint64_t)(unsigned long)&addr, + .value = (uint64_t)(unsigned long)&_dummy, + .flags = BPF_NOEXIST, + }; + + r = n_acd_syscall_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); + if (r < 0) + return -errno; + + return 0; +} + +int n_acd_bpf_map_remove(int mapfd, struct in_addr *addrp) { + uint32_t addr = be32toh(addrp->s_addr); + union bpf_attr attr; + int r; + + memset(&attr, 0, sizeof(attr)); + attr = (union bpf_attr){ + .map_fd = mapfd, + .key = (uint64_t)(unsigned long)&addr, + }; + + r = n_acd_syscall_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); + if (r < 0) + return -errno; + + return 0; +} + +int n_acd_bpf_compile(int *progfdp, int mapfd, struct 
ether_addr *macp) { + const union { + uint8_t u8[6]; + uint16_t u16[3]; + uint32_t u32[1]; + } mac = { + .u8 = { + macp->ether_addr_octet[0], + macp->ether_addr_octet[1], + macp->ether_addr_octet[2], + macp->ether_addr_octet[3], + macp->ether_addr_octet[4], + macp->ether_addr_octet[5], + }, + }; + struct bpf_insn prog[] = { + /* for using BPF_LD_ABS r6 must point to the skb, currently in r1 */ + BPF_MOV_REG(6, 1), /* r6 = r1 */ + + /* drop the packet if it is too short */ + BPF_LDX_MEM(BPF_W, 0, 6, offsetof(struct __sk_buff, len)), /* r0 = skb->len */ + BPF_JMP_IMM(BPF_JGE, 0, sizeof(struct ether_arp), 2), /* if (r0 >= sizeof(ether_arp)) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + /* drop the packet if the header is not as expected */ + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_hrd)), /* r0 = header type */ + BPF_JMP_IMM(BPF_JEQ, 0, ARPHRD_ETHER, 2), /* if (r0 == ethernet) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_pro)), /* r0 = protocol */ + BPF_JMP_IMM(BPF_JEQ, 0, ETHERTYPE_IP, 2), /* if (r0 == IP) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + BPF_LD_ABS(BPF_B, offsetof(struct ether_arp, arp_hln)), /* r0 = hw addr length */ + BPF_JMP_IMM(BPF_JEQ, 0, sizeof(struct ether_addr), 2), /* if (r0 == sizeof(ether_addr)) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + BPF_LD_ABS(BPF_B, offsetof(struct ether_arp, arp_pln)), /* r0 = protocol addr length */ + BPF_JMP_IMM(BPF_JEQ, 0, sizeof(struct in_addr), 2), /* if (r0 == sizeof(in_addr)) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + /* drop packets from our own mac address */ + BPF_LD_ABS(BPF_W, offsetof(struct ether_arp, arp_sha)), /* r0 = first four bytes of packet mac address */ + BPF_JMP_IMM(BPF_JNE, 0, be32toh(mac.u32[0]), 4), /* if (r0 != first four bytes of our mac address) skip 4 */ + 
BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_sha) + 4), /* r0 = last two bytes of packet mac address */ + BPF_JMP_IMM(BPF_JNE, 0, be16toh(mac.u16[2]), 2), /* if (r0 != last two bytes of our mac address) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + /* + * We listen for two kinds of packets: + * Conflicts) + * These are requests or replies with the sender address not set to INADDR_ANY. The + * conflicted address is the sender address, remember this in r7. + * Probes) + * These are requests with the sender address set to INADDR_ANY. The probed address + * is the target address, remember this in r7. + * Any other packets are dropped. + */ + BPF_LD_ABS(BPF_W, offsetof(struct ether_arp, arp_spa)), /* r0 = sender ip address */ + BPF_JMP_IMM(BPF_JEQ, 0, 0, 7), /* if (r0 == 0) skip 7 */ + BPF_MOV_REG(7, 0), /* r7 = r0 */ + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_op)), /* r0 = operation */ + BPF_JMP_IMM(BPF_JEQ, 0, ARPOP_REQUEST, 3), /* if (r0 == request) skip 3 */ + BPF_JMP_IMM(BPF_JEQ, 0, ARPOP_REPLY, 2), /* if (r0 == reply) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + BPF_JMP_IMM(BPF_JA, 0, 0, 6), /* skip 6 */ + BPF_LD_ABS(BPF_W, offsetof(struct ether_arp, arp_tpa)), /* r0 = target ip address */ + BPF_MOV_REG(7, 0), /* r7 = r0 */ + BPF_LD_ABS(BPF_H, offsetof(struct ether_arp, arp_op)), /* r0 = operation */ + BPF_JMP_IMM(BPF_JEQ, 0, ARPOP_REQUEST, 2), /* if (r0 == request) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), /* return */ + + /* check if the probe or conflict is for an address we are monitoring */ + BPF_STX_MEM(BPF_W, 10, 7, -4), /* *(uint32_t*)fp - 4 = r7 */ + BPF_MOV_REG(2, 10), /* r2 = fp */ + BPF_ALU_IMM(BPF_ADD, 2, -4), /* r2 -= 4 */ + BPF_LD_MAP_FD(1, mapfd), /* r1 = mapfd */ + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), /* r0 = map_lookup_elem(r1, r2) */ + BPF_JMP_IMM(BPF_JNE, 0, 0, 2), /* if (r0 != NULL) skip 2 */ + BPF_MOV_IMM(0, 0), /* r0 = 0 */ + 
BPF_EXIT_INSN(), /* return */ + + /* return exactly the packet length*/ + BPF_MOV_IMM(0, sizeof(struct ether_arp)), /* r0 = sizeof(struct ether_arp) */ + BPF_EXIT_INSN(), /* return */ + }; + union bpf_attr attr; + int progfd; + + memset(&attr, 0, sizeof(attr)); + attr = (union bpf_attr){ + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .insns = (uint64_t)(unsigned long)prog, + .insn_cnt = sizeof(prog) / sizeof(*prog), + .license = (uint64_t)(unsigned long)"ASL", + }; + + progfd = n_acd_syscall_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + if (progfd < 0) + return -errno; + + *progfdp = progfd; + return 0; +} diff --git a/src/n-acd-private.h b/src/n-acd-private.h new file mode 100644 index 0000000000..3f20791234 --- /dev/null +++ b/src/n-acd-private.h @@ -0,0 +1,172 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/timer.h" +#include "n-acd.h" + +typedef struct NAcdEventNode NAcdEventNode; + +#define _cleanup_(_x) __attribute__((__cleanup__(_x))) +#define _public_ __attribute__((__visibility__("default"))) + +/* This augments the error-codes with internal ones that are never exposed. 
*/ +enum { + _N_ACD_INTERNAL = _N_ACD_E_N, + + N_ACD_E_DROPPED, +}; + +enum { + N_ACD_PROBE_STATE_PROBING, + N_ACD_PROBE_STATE_CONFIGURING, + N_ACD_PROBE_STATE_ANNOUNCING, + N_ACD_PROBE_STATE_FAILED, +}; + +struct NAcdConfig { + int ifindex; + unsigned int transport; + uint8_t mac[ETH_ALEN]; + size_t n_mac; +}; + +#define N_ACD_CONFIG_NULL(_x) { \ + .transport = _N_ACD_TRANSPORT_N, \ + } + +struct NAcdProbeConfig { + struct in_addr ip; + uint64_t timeout_msecs; +}; + +#define N_ACD_PROBE_CONFIG_NULL(_x) { \ + .timeout_msecs = N_ACD_TIMEOUT_RFC5227, \ + } + +struct NAcdEventNode { + CList acd_link; + CList probe_link; + NAcdEvent event; + uint8_t sender[ETH_ALEN]; + bool is_public : 1; +}; + +#define N_ACD_EVENT_NODE_NULL(_x) { \ + .acd_link = C_LIST_INIT((_x).acd_link), \ + .probe_link = C_LIST_INIT((_x).probe_link), \ + } + +struct NAcd { + unsigned long n_refs; + unsigned int seed; + int fd_epoll; + int fd_socket; + CRBTree ip_tree; + CList event_list; + Timer timer; + + /* BPF map */ + int fd_bpf_map; + size_t n_bpf_map; + size_t max_bpf_map; + + /* configuration */ + int ifindex; + uint8_t mac[ETH_ALEN]; + + /* flags */ + bool preempted : 1; +}; + +#define N_ACD_NULL(_x) { \ + .n_refs = 1, \ + .fd_epoll = -1, \ + .fd_socket = -1, \ + .ip_tree = C_RBTREE_INIT, \ + .event_list = C_LIST_INIT((_x).event_list), \ + .timer = TIMER_NULL((_x).timer), \ + .fd_bpf_map = -1, \ + } + +struct NAcdProbe { + NAcd *acd; + CRBNode ip_node; + CList event_list; + Timeout timeout; + + /* configuration */ + struct in_addr ip; + uint64_t timeout_multiplier; + void *userdata; + + /* state */ + unsigned int state; + unsigned int n_iteration; + unsigned int defend; + uint64_t last_defend; +}; + +#define N_ACD_PROBE_NULL(_x) { \ + .ip_node = C_RBNODE_INIT((_x).ip_node), \ + .event_list = C_LIST_INIT((_x).event_list), \ + .timeout = TIMEOUT_INIT((_x).timeout), \ + .state = N_ACD_PROBE_STATE_PROBING, \ + .defend = N_ACD_DEFEND_NEVER, \ + } + +/* events */ + +int 
n_acd_event_node_new(NAcdEventNode **nodep); +NAcdEventNode *n_acd_event_node_free(NAcdEventNode *node); + +/* contexts */ + +void n_acd_remember(NAcd *acd, uint64_t now, bool success); +int n_acd_raise(NAcd *acd, NAcdEventNode **nodep, unsigned int event); +int n_acd_send(NAcd *acd, const struct in_addr *tpa, const struct in_addr *spa); +int n_acd_ensure_bpf_map_space(NAcd *acd); + +/* probes */ + +int n_acd_probe_new(NAcdProbe **probep, NAcd *acd, NAcdProbeConfig *config); +int n_acd_probe_raise(NAcdProbe *probe, NAcdEventNode **nodep, unsigned int event); +int n_acd_probe_handle_timeout(NAcdProbe *probe); +int n_acd_probe_handle_packet(NAcdProbe *probe, struct ether_arp *packet, bool hard_conflict); + +/* eBPF */ + +int n_acd_bpf_map_create(int *mapfdp, size_t max_elements); +int n_acd_bpf_map_add(int mapfd, struct in_addr *addr); +int n_acd_bpf_map_remove(int mapfd, struct in_addr *addr); + +int n_acd_bpf_compile(int *progfdp, int mapfd, struct ether_addr *mac); + +/* inline helpers */ + +static inline int n_acd_errno(void) { + /* + * Compilers continuously warn about uninitialized variables since they + * cannot deduce that `return -errno;` will always be negative. This + * small wrapper makes sure compilers figure that out. Use it as + * replacement for `errno` read access. Yes, it generates worse code, + * but only marginally and only affects slow-paths. + */ + return abs(errno) ? 
: EIO; +} + +static inline void n_acd_event_node_freep(NAcdEventNode **node) { + if (*node) + n_acd_event_node_free(*node); +} + +static inline void n_acd_closep(int *fdp) { + if (*fdp >= 0) + close(*fdp); +} diff --git a/src/n-acd-probe.c b/src/n-acd-probe.c new file mode 100644 index 0000000000..8c233b56a7 --- /dev/null +++ b/src/n-acd-probe.c @@ -0,0 +1,636 @@ +/* + * IPv4 Address Conflict Detection + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "n-acd.h" +#include "n-acd-private.h" + +/* + * These parameters and timing intervals specified in RFC-5227. The original + * values are: + * + * PROBE_NUM 3 + * PROBE_WAIT 1s + * PROBE_MIN 1s + * PROBE_MAX 3s + * ANNOUNCE_NUM 3 + * ANNOUNCE_WAIT 2s + * ANNOUNCE_INTERVAL 2s + * MAX_CONFLICTS 10 + * RATE_LIMIT_INTERVAL 60s + * DEFEND_INTERVAL 10s + * + * If we assume a best-case and worst-case scenario for non-conflicted runs, we + * end up with a runtime between 4s and 9s to finish the probe. Then it still + * takes a fixed 4s to finish the announcements. + * + * RFC 5227 section 1.1: + * [...] (Note that the values listed here are fixed constants; they are + * not intended to be modifiable by implementers, operators, or end users. + * These constants are given symbolic names here to facilitate the writing + * of future standards that may want to reference this document with + * different values for these named constants; however, at the present time + * no such future standards exist.) [...] + * + * Unfortunately, no-one ever stepped up to write a "future standard" to revise + * the timings. A 9s timeout for successful link setups is not acceptable today. + * Hence, we will just go forward and ignore the proposed values. On both + * wired and wireless local links round-trip latencies of below 3ms are common. 
+ * We require the caller to set a timeout multiplier, where 1 corresponds to a + * total probe time between 0.5 ms and 1.0 ms. On modern networks a multiplier + * of about 100 should be a reasonable default. To comply with the RFC select a + * multiplier of 9000. + */ +#define N_ACD_RFC_PROBE_NUM (3) +#define N_ACD_RFC_PROBE_WAIT_NSEC (UINT64_C(111111)) /* 1/9 ms */ +#define N_ACD_RFC_PROBE_MIN_NSEC (UINT64_C(111111)) /* 1/9 ms */ +#define N_ACD_RFC_PROBE_MAX_NSEC (UINT64_C(333333)) /* 3/9 ms */ +#define N_ACD_RFC_ANNOUNCE_NUM (3) +#define N_ACD_RFC_ANNOUNCE_WAIT_NSEC (UINT64_C(222222)) /* 2/9 ms */ +#define N_ACD_RFC_ANNOUNCE_INTERVAL_NSEC (UINT64_C(222222)) /* 2/9 ms */ +#define N_ACD_RFC_MAX_CONFLICTS (10) +#define N_ACD_RFC_RATE_LIMIT_INTERVAL_NSEC (UINT64_C(60000000000)) /* 60s */ +#define N_ACD_RFC_DEFEND_INTERVAL_NSEC (UINT64_C(10000000000)) /* 10s */ + +/** + * XXX + */ +_public_ int n_acd_probe_config_new(NAcdProbeConfig **configp) { + _cleanup_(n_acd_probe_config_freep) NAcdProbeConfig *config = NULL; + + config = malloc(sizeof(*config)); + if (!config) + return -ENOMEM; + + *config = (NAcdProbeConfig)N_ACD_PROBE_CONFIG_NULL(*config); + + *configp = config; + config = NULL; + return 0; +} + +/** + * XXX + */ +_public_ NAcdProbeConfig *n_acd_probe_config_free(NAcdProbeConfig *config) { + if (!config) + return NULL; + + free(config); + + return NULL; +} + +/** + * XXX + */ +_public_ void n_acd_probe_config_set_ip(NAcdProbeConfig *config, struct in_addr ip) { + config->ip = ip; +} + +/** + * XXX + */ +_public_ void n_acd_probe_config_set_timeout(NAcdProbeConfig *config, uint64_t msecs) { + config->timeout_msecs = msecs; +} + +static void n_acd_probe_schedule(NAcdProbe *probe, uint64_t n_timeout, unsigned int n_jitter) { + uint64_t n_time; + + timer_now(&probe->acd->timer, &n_time); + n_time += n_timeout; + + /* + * ACD specifies jitter values to reduce packet storms on the local + * link. 
This call accepts the maximum relative jitter value in + * nanoseconds as @n_jitter. We then use rand_r(3p) to get a + * pseudo-random jitter on top of the real timeout given as @n_timeout. + */ + if (n_jitter) { + uint64_t random; + + random = ((uint64_t)rand_r(&probe->acd->seed) << 32) | (uint64_t)rand_r(&probe->acd->seed); + n_time += random % n_jitter; + } + + timeout_schedule(&probe->timeout, &probe->acd->timer, n_time); +} + +static void n_acd_probe_unschedule(NAcdProbe *probe) { + timeout_unschedule(&probe->timeout); +} + +static bool n_acd_probe_is_unique(NAcdProbe *probe) { + NAcdProbe *sibling; + + if (!c_rbnode_is_linked(&probe->ip_node)) + return false; + + sibling = c_rbnode_entry(c_rbnode_next(&probe->ip_node), NAcdProbe, ip_node); + if (sibling && sibling->ip.s_addr == probe->ip.s_addr) + return false; + + sibling = c_rbnode_entry(c_rbnode_prev(&probe->ip_node), NAcdProbe, ip_node); + if (sibling && sibling->ip.s_addr == probe->ip.s_addr) + return false; + + return true; +} + +static int n_acd_probe_link(NAcdProbe *probe) { + int r; + + /* + * Make sure the kernel bpf map has space for at least one more + * entry. + */ + r = n_acd_ensure_bpf_map_space(probe->acd); + if (r) + return r; + + /* + * Link entry into context, indexed by its IP. Note that we allow + * duplicates just fine. It is up to you to decide whether to avoid + * duplicates, if you don't want them. Duplicates on the same context + * do not conflict with each other, though. + */ + { + CRBNode **slot, *parent; + NAcdProbe *other; + + slot = &probe->acd->ip_tree.root; + parent = NULL; + while (*slot) { + other = c_rbnode_entry(*slot, NAcdProbe, ip_node); + parent = *slot; + if (probe->ip.s_addr < other->ip.s_addr) + slot = &(*slot)->left; + else + slot = &(*slot)->right; + } + + c_rbtree_add(&probe->acd->ip_tree, parent, slot, &probe->ip_node); + } + + /* + * Add the ip address to the map, if it is not already there. 
+ */ + if (n_acd_probe_is_unique(probe)) { + r = n_acd_bpf_map_add(probe->acd->fd_bpf_map, &probe->ip); + if (r) { + /* + * Make sure the IP address is linked in userspace iff + * it is linked in the kernel. + */ + c_rbnode_unlink(&probe->ip_node); + return r; + } + ++probe->acd->n_bpf_map; + } + + return 0; +} + +/* + * Remove @probe from the IP tree of its context. If no neighbouring tree + * entry carries the same IP, the IP is also deleted from the BPF map and the + * map entry counter is decremented. + */ +static void n_acd_probe_unlink(NAcdProbe *probe) { + int r; + + /* + * If this is the only probe for a given IP, remove the IP from the + * kernel BPF map. + */ + if (n_acd_probe_is_unique(probe)) { + r = n_acd_bpf_map_remove(probe->acd->fd_bpf_map, &probe->ip); + /* NOTE(review): assert() is compiled out under NDEBUG, so a failed removal is then ignored */ + assert(r >= 0); + --probe->acd->n_bpf_map; + } + c_rbnode_unlink(&probe->ip_node); +} + +/* + * Allocate a probe for config->ip on the context @acd and take a reference + * on @acd. An all-zero IP is rejected with N_ACD_E_INVALID_ARGUMENT, a failed + * allocation with -ENOMEM. + */ +int n_acd_probe_new(NAcdProbe **probep, NAcd *acd, NAcdProbeConfig *config) { + _cleanup_(n_acd_probe_freep) NAcdProbe *probe = NULL; + int r; + + if (!config->ip.s_addr) + return N_ACD_E_INVALID_ARGUMENT; + + probe = malloc(sizeof(*probe)); + if (!probe) + return -ENOMEM; + + *probe = (NAcdProbe)N_ACD_PROBE_NULL(*probe); + probe->acd = n_acd_ref(acd); + probe->ip = config->ip; + + /* + * We use the provided timeout-length as multiplier for all our + * timeouts. The provided timeout defines the maximum length of an + * entire probe-interval until the first announcement. Given the + * spec-provided parameters, this ends up as: + * + * PROBE_WAIT + PROBE_MAX + PROBE_MAX + ANNOUNCE_WAIT + * = 1s + 3s + 3s + 2s + * = 9s + * + * Hence, the default value for this timeout is 9000ms, which just + * ends up matching the spec-provided values. + * + * What we now semantically do is divide this timeout by 1ns/1000000. + * This first turns it into nanoseconds, then strips the unit by + * turning it into a multiplier. However, rather than performing the + * division here, we multiply all our timeouts by 1000000 statically + * at compile time. Therefore, we can use the user-provided timeout as + * unmodified multiplier. No conversion necessary.
+ */ + probe->timeout_multiplier = config->timeout_msecs; + + r = n_acd_probe_link(probe); + if (r) + return r; + + /* + * Now that everything is set up, we have to send the first probe. This + * is done after ~PROBE_WAIT seconds, hence we schedule our timer. + * In case no timeout-multiplier is set, we pretend we already sent all + * probes successfully and schedule the timer so we proceed with the + * announcements. We must schedule a fake timer there, since we are not + * allowed to advance the state machine outside of n_acd_dispatch(). + */ + if (probe->timeout_multiplier) { + probe->n_iteration = 0; + n_acd_probe_schedule(probe, + 0, + probe->timeout_multiplier * N_ACD_RFC_PROBE_WAIT_NSEC); + } else { + probe->n_iteration = N_ACD_RFC_PROBE_NUM; + n_acd_probe_schedule(probe, 0, 0); + } + + *probep = probe; + probe = NULL; + return 0; +} + +/** + * XXX + */ +_public_ NAcdProbe *n_acd_probe_free(NAcdProbe *probe) { + NAcdEventNode *node, *t_node; + + if (!probe) + return NULL; + + c_list_for_each_entry_safe(node, t_node, &probe->event_list, probe_link) + n_acd_event_node_free(node); + + n_acd_probe_unschedule(probe); + n_acd_probe_unlink(probe); + probe->acd = n_acd_unref(probe->acd); + free(probe); + + return NULL; +} + +int n_acd_probe_raise(NAcdProbe *probe, NAcdEventNode **nodep, unsigned int event) { + _cleanup_(n_acd_event_node_freep) NAcdEventNode *node = NULL; + int r; + + r = n_acd_raise(probe->acd, &node, event); + if (r) + return r; + + switch (event) { + case N_ACD_EVENT_READY: + node->event.ready.probe = probe; + break; + case N_ACD_EVENT_USED: + node->event.used.probe = probe; + break; + case N_ACD_EVENT_DEFENDED: + node->event.defended.probe = probe; + break; + case N_ACD_EVENT_CONFLICT: + node->event.conflict.probe = probe; + break; + default: + assert(0); + return -EIO; + } + + c_list_link_tail(&probe->event_list, &node->probe_link); + + if (nodep) + *nodep = node; + node = NULL; + return 0; +} + +int n_acd_probe_handle_timeout(NAcdProbe *probe) 
{ + int r; + + switch (probe->state) { + case N_ACD_PROBE_STATE_PROBING: + /* + * We are still PROBING. We send 3 probes with a random timeout + * scheduled between each. If, after a fixed timeout, we did + * not receive any conflict we consider the probing successful. + */ + if (probe->n_iteration < N_ACD_RFC_PROBE_NUM) { + /* + * We have not sent all 3 probes, yet. A timer fired, + * so we are ready to send the next probe. If this is + * the third probe, schedule a timer for ANNOUNCE_WAIT + * to give other peers a chance to answer. If this is + * not the third probe, wait between PROBE_MIN and + * PROBE_MAX for the next probe. + */ + + r = n_acd_send(probe->acd, &probe->ip, NULL); + if (r) { + if (r != -N_ACD_E_DROPPED) + return r; + + /* + * Packet was dropped, and we know about it. It + * never reached the network. Reasons are + * manifold, and n_acd_send() raises events if + * necessary. + * From a probe-perspective, we simply pretend + * we never sent the probe and schedule a + * timeout for the next probe, effectively + * doubling a single probe-interval. + */ + } else { + /* Successfully sent, so advance counter. */ + ++probe->n_iteration; + } + + if (probe->n_iteration < N_ACD_RFC_PROBE_NUM) + n_acd_probe_schedule(probe, + probe->timeout_multiplier * N_ACD_RFC_PROBE_MIN_NSEC, + probe->timeout_multiplier * (N_ACD_RFC_PROBE_MAX_NSEC - N_ACD_RFC_PROBE_MIN_NSEC)); + else + n_acd_probe_schedule(probe, + probe->timeout_multiplier * N_ACD_RFC_ANNOUNCE_WAIT_NSEC, + 0); + } else { + /* + * All 3 probes succeeded and we waited enough to + * consider this address usable by now. Do not announce + * the address, yet. We must first give the caller a + * chance to configure the address (so they can answer + * ARP requests), before announcing it. 
+ */ + r = n_acd_probe_raise(probe, NULL, N_ACD_EVENT_READY); + if (r) + return r; + + probe->state = N_ACD_PROBE_STATE_CONFIGURING; + } + + break; + + case N_ACD_PROBE_STATE_ANNOUNCING: + /* + * We are ANNOUNCING, meaning the caller configured the address + * on the interface and is actively using it. We send 3 + * announcements out, in a short interval, and then just + * perform passive conflict detection. + * Note that once all 3 announcements are sent, we no longer + * schedule a timer, so this part should not trigger, anymore. + */ + + r = n_acd_send(probe->acd, &probe->ip, &probe->ip); + if (r) { + if (r != -N_ACD_E_DROPPED) + return r; + + /* + * See above in STATE_PROBING for details. We know the + * packet was never sent, so we simply try again after + * extending the timer. + */ + } else { + /* Successfully sent, so advance counter. */ + ++probe->n_iteration; + } + + if (probe->n_iteration < N_ACD_RFC_ANNOUNCE_NUM) { + /* + * Announcements are always scheduled according to the + * time-intervals specified in the spec. We always use + * the RFC5227-mandated multiplier. + * If you reconsider this, note that timeout_multiplier + * might be 0 here. + */ + n_acd_probe_schedule(probe, + N_ACD_TIMEOUT_RFC5227 * N_ACD_RFC_ANNOUNCE_INTERVAL_NSEC, + 0); + } + + break; + + case N_ACD_PROBE_STATE_CONFIGURING: + case N_ACD_PROBE_STATE_FAILED: + default: + /* + * There are no timeouts in these states. If we trigger one, + * something is fishy. + */ + assert(0); + return -EIO; + } + + return 0; +} + +int n_acd_probe_handle_packet(NAcdProbe *probe, struct ether_arp *packet, bool hard_conflict) { + NAcdEventNode *node; + uint64_t now; + int r; + + timer_now(&probe->acd->timer, &now); + + switch (probe->state) { + case N_ACD_PROBE_STATE_PROBING: + /* + * Regardless whether this is a hard or soft conflict, we must + * treat this as a probe failure. That is, notify the caller of + * the conflict and wait for further instructions. 
We do not + * react to this, until the caller tells us what to do, but we + * do stop sending further probes. + */ + r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_USED); + if (r) + return r; + + node->event.used.sender = node->sender; + node->event.used.n_sender = ETH_ALEN; + memcpy(node->sender, packet->arp_sha, ETH_ALEN); + + n_acd_probe_unschedule(probe); + n_acd_probe_unlink(probe); + probe->state = N_ACD_PROBE_STATE_FAILED; + + break; + + case N_ACD_PROBE_STATE_CONFIGURING: + /* + * We are waiting for the caller to configure the interface and + * start ANNOUNCING. In this state, we cannot defend the + * address as that would indicate that it is ready to be used, + * and we cannot signal CONFLICT or USED as the caller may + * already have started to use the address (and may have + * configured the engine to always defend it, which means they + * should be able to rely on never losing it after READY). + * Simply drop the event, and rely on the anticipated ANNOUNCE + * to trigger it again. + */ + + break; + + case N_ACD_PROBE_STATE_ANNOUNCING: { + /* + * We were already instructed to announce the address, which + * means the address is configured and in use. Hence, the + * caller is responsible to serve regular ARP queries. Meaning, + * we can ignore any soft conflicts (other peers doing ACD). + * + * But if we see a hard-conflict, we either defend the address + * according to the caller's instructions, or we report the + * conflict and bail out. 
+ */ + bool conflict = false, rate_limited = false; + + if (!hard_conflict) + break; + + rate_limited = now < probe->last_defend + N_ACD_RFC_DEFEND_INTERVAL_NSEC; + + switch (probe->defend) { + case N_ACD_DEFEND_NEVER: + conflict = true; + break; + case N_ACD_DEFEND_ONCE: + if (rate_limited) { + conflict = true; + break; + } + + /* fallthrough */ + case N_ACD_DEFEND_ALWAYS: + if (!rate_limited) { + r = n_acd_send(probe->acd, &probe->ip, &probe->ip); + if (r) { + if (r != -N_ACD_E_DROPPED) + return r; + + if (probe->defend == N_ACD_DEFEND_ONCE) { + conflict = true; + break; + } + } + + if (r != -N_ACD_E_DROPPED) + probe->last_defend = now; + } + + r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_DEFENDED); + if (r) + return r; + + node->event.defended.sender = node->sender; + node->event.defended.n_sender = ETH_ALEN; + memcpy(node->sender, packet->arp_sha, ETH_ALEN); + + break; + } + + if (conflict) { + r = n_acd_probe_raise(probe, &node, N_ACD_EVENT_CONFLICT); + if (r) + return r; + + node->event.conflict.sender = node->sender; + node->event.conflict.n_sender = ETH_ALEN; + memcpy(node->sender, packet->arp_sha, ETH_ALEN); + + n_acd_probe_unschedule(probe); + n_acd_probe_unlink(probe); + probe->state = N_ACD_PROBE_STATE_FAILED; + } + + break; + } + + case N_ACD_PROBE_STATE_FAILED: + default: + /* + * We are not listening for packets in these states. If we receive one, + * something is fishy. 
+ */ + assert(0); + return -EIO; + } + + return 0; +} + +/** + * n_acd_probe_set_userdata() - store a caller-defined pointer on a probe + * @probe: probe object + * @userdata: pointer to store + * + * Store @userdata in @probe, replacing any previously stored pointer. + */ +_public_ void n_acd_probe_set_userdata(NAcdProbe *probe, void *userdata) { + probe->userdata = userdata; +} + +/** + * n_acd_probe_get_userdata() - retrieve the pointer stored on a probe + * @probe: probe object + * @userdatap: output argument for the stored pointer + * + * Write the pointer currently stored in @probe to @userdatap. + */ +_public_ void n_acd_probe_get_userdata(NAcdProbe *probe, void **userdatap) { + *userdatap = probe->userdata; +} + +/** + * n_acd_probe_announce() - announce the configured IP address + * @probe: probe object + * @defend: defence policy + * + * Announce the IP address on the local link, and start defending it according + * to the given policy, which must be one of N_ACD_DEFEND_ONCE, + * N_ACD_DEFEND_NEVER, or N_ACD_DEFEND_ALWAYS. + * + * This must be called in response to an N_ACD_EVENT_READY event, and only + * after the given address has been configured on the given network interface. + * + * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the defence policy + * is invalid, negative error code on failure. + */ +_public_ int n_acd_probe_announce(NAcdProbe *probe, unsigned int defend) { + if (defend >= _N_ACD_DEFEND_N) + return N_ACD_E_INVALID_ARGUMENT; + + /* switch to ANNOUNCING and restart the iteration counter */ + probe->state = N_ACD_PROBE_STATE_ANNOUNCING; + probe->defend = defend; + probe->n_iteration = 0; + + /* + * We must schedule a fake-timeout, since we are not allowed to + * advance the state-machine outside of n_acd_dispatch(). + */ + n_acd_probe_schedule(probe, 0, 0); + + return 0; +} diff --git a/src/n-acd.c b/src/n-acd.c index 266e5d6f2a..def56a2152 100644 --- a/src/n-acd.c +++ b/src/n-acd.c @@ -1,188 +1,38 @@ /* * IPv4 Address Conflict Detection - * - * This implements the main n-acd API. It is built around an epoll-fd to - * encapsulate a timerfd+socket. The n-acd context has quite straightforward - * lifetime rules. The parameters must be set when the engine is started, and - * they can only be changed by stopping and restartding the engine. The engine - * is started on demand and stopped when no longer needed.
- * During the entire lifetime the context can be dispatched. That is, the - * dispatcher does not have to be aware of the context state. After each call - * to dispatch(), the caller must pop all pending events until -EAGAIN is - * returned. - * - * If a conflict is detected, the ACD engine reports to the caller and stops - * the engine. The caller can now modify parameters and restart the engine, if - * required. */ #include #include +#include #include #include #include +#include #include -#include -#include #include -#include #include #include -#include -#include #include #include #include #include #include -#include #include #include #include "n-acd.h" - -#define _public_ __attribute__((__visibility__("default"))) - -/* - * These parameters and timing intervals specified in RFC-5227. The original - * values are: - * - * PROBE_NUM 3 - * PROBE_WAIT 1s - * PROBE_MIN 1s - * PROBE_MAX 3s - * ANNOUNCE_NUM 3 - * ANNOUNCE_WAIT 2s - * ANNOUNCE_INTERVAL 2s - * MAX_CONFLICTS 10 - * RATE_LIMIT_INTERVAL 60s - * DEFEND_INTERVAL 10s - * - * If we assume a best-case and worst-case scenario for non-conflicted runs, we - * end up with a runtime between 4s and 9s to finish the probe. Then it still - * takes a fixed 4s to finish the announcements. - * - * RFC 5227 section 1.1: - * [...] (Note that the values listed here are fixed constants; they are - * not intended to be modifiable by implementers, operators, or end users. - * These constants are given symbolic names here to facilitate the writing - * of future standards that may want to reference this document with - * different values for these named constants; however, at the present time - * no such future standards exist.) [...] - * - * Unfortunately, no-one ever stepped up to write a "future standard" to revise - * the timings. A 9s timeout for successful link setups is not acceptable today. - * Hence, we will just go forward and ignore the proposed values. 
On both - * wired and wireless local links round-trip latencies of below 3ms are common, - * while latencies above 10ms are rarely seen. We require the caller to set a - * timeout multiplier, where 1 corresponds to a total probe time of 0.5 ms and - * 1.0 ms. On modern networks a multiplier of about 100 should be a reasonable - * default. To comply with the RFC select a multiplier of 9000. - */ -#define N_ACD_RFC_PROBE_NUM (3) -#define N_ACD_RFC_PROBE_WAIT_USEC (UINT64_C(111)) /* 111us */ -#define N_ACD_RFC_PROBE_MIN_USEC (UINT64_C(111)) /* 111us */ -#define N_ACD_RFC_PROBE_MAX_USEC (UINT64_C(333)) /* 333us */ -#define N_ACD_RFC_ANNOUNCE_NUM (3) -#define N_ACD_RFC_ANNOUNCE_WAIT_USEC (UINT64_C(222)) /* 222us */ -#define N_ACD_RFC_ANNOUNCE_INTERVAL_USEC (UINT64_C(222)) /* 222us */ -#define N_ACD_RFC_MAX_CONFLICTS (10) -#define N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC (UINT64_C(60000000)) /* 60s */ -#define N_ACD_RFC_DEFEND_INTERVAL_USEC (UINT64_C(10000000)) /* 10s */ - -/* - * Fake ENETDOWN error-code. We use this as replacement for known EFOOBAR error - * codes. It is explicitly chosen to be outside the known error-code range. - * Whenever we are deep down in a call-stack and notice a ENETDOWN error, we - * return this instead. It is caught by the top-level dispatcher and then - * properly handled. - * This avoids gracefully handling ENETDOWN in call-stacks, but then continuing - * with some work in the callers without noticing the soft failure. 
- */ -#define N_ACD_E_DOWN (INT_MAX) - -#define TIME_INFINITY ((uint64_t) -1) +#include "n-acd-private.h" enum { N_ACD_EPOLL_TIMER, N_ACD_EPOLL_SOCKET, }; -enum { - N_ACD_STATE_INIT, - N_ACD_STATE_PROBING, - N_ACD_STATE_CONFIGURING, - N_ACD_STATE_ANNOUNCING, -}; - -typedef struct NAcdEventNode { - NAcdEvent event; - uint8_t sender[ETH_ALEN]; - CList link; -} NAcdEventNode; - -struct NAcd { - /* context */ - unsigned int seed; - int fd_epoll; - int fd_timer; - - /* configuration */ - NAcdConfig config; - uint8_t mac[ETH_ALEN]; - uint64_t timeout_multiplier; - - /* runtime */ - int fd_socket; - unsigned int state; - unsigned int n_iteration; - unsigned int n_conflicts; - unsigned int defend; - uint64_t last_defend; - uint64_t last_conflict; - - /* pending events */ - CList events; - NAcdEventNode *current; -}; - -static int n_acd_errno(void) { - /* - * Compilers continuously warn about uninitialized variables since they - * cannot deduce that `return -errno;` will always be negative. This - * small wrapper makes sure compilers figure that out. Use it as - * replacement for `errno` read access. Yes, it generates worse code, - * but only marginally and only affects slow-paths. - */ - return abs(errno) ? 
: EIO; -} - -static int n_acd_event_node_new(NAcdEventNode **nodep, unsigned int event) { - NAcdEventNode *node; - - node = calloc(1, sizeof(*node)); - if (!node) - return -ENOMEM; - - node->event.event = event; - node->link = (CList)C_LIST_INIT(node->link); - - *nodep = node; - - return 0; -} - -static NAcdEventNode *n_acd_event_node_free(NAcdEventNode *node) { - if (!node) - return NULL; - - c_list_unlink(&node->link); - free(node); - - return NULL; -} - static int n_acd_get_random(unsigned int *random) { - uint8_t hash_seed[] = { 0x3a, 0x0c, 0xa6, 0xdd, 0x44, 0xef, 0x5f, 0x7a, 0x5e, 0xd7, 0x25, 0x37, 0xbf, 0x4e, 0x80, 0xa1 }; + uint8_t hash_seed[] = { + 0x3a, 0x0c, 0xa6, 0xdd, 0x44, 0xef, 0x5f, 0x7a, + 0x5e, 0xd7, 0x25, 0x37, 0xbf, 0x4e, 0x80, 0xa1, + }; CSipHash hash = C_SIPHASH_NULL; struct timespec ts; const uint8_t *p; @@ -203,7 +53,7 @@ static int n_acd_get_random(unsigned int *random) { if (p) c_siphash_append(&hash, p, 16); - r = clock_gettime(CLOCK_BOOTTIME, &ts); + r = clock_gettime(CLOCK_MONOTONIC, &ts); if (r < 0) return -n_acd_errno(); @@ -214,12 +64,243 @@ static int n_acd_get_random(unsigned int *random) { return 0; } -static void n_acd_reset(NAcd *acd) { - acd->state = N_ACD_STATE_INIT; - acd->defend = N_ACD_DEFEND_NEVER; - acd->n_iteration = 0; - acd->last_defend = 0; - timerfd_settime(acd->fd_timer, 0, &(struct itimerspec){}, NULL); +static int n_acd_socket_new(int *fdp, int fd_bpf_prog, NAcdConfig *config) { + const struct sockaddr_ll address = { + .sll_family = AF_PACKET, + .sll_protocol = htobe16(ETH_P_ARP), + .sll_ifindex = config->ifindex, + .sll_halen = ETH_ALEN, + .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + }; + int r, s = -1; + + s = socket(PF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); + if (s < 0) { + r = -n_acd_errno(); + goto error; + } + + if (fd_bpf_prog >= 0) { + r = setsockopt(s, SOL_SOCKET, SO_ATTACH_BPF, &fd_bpf_prog, sizeof(fd_bpf_prog)); + if (r < 0) + return -n_acd_errno(); + } + + r = bind(s, (struct 
sockaddr *)&address, sizeof(address)); + if (r < 0) { + r = -n_acd_errno(); + goto error; + } + + *fdp = s; + s = -1; + return 0; + +error: + if (s >= 0) + close(s); + return r; +} + +/** + * XXX + */ +_public_ int n_acd_config_new(NAcdConfig **configp) { + _cleanup_(n_acd_config_freep) NAcdConfig *config = NULL; + + config = malloc(sizeof(*config)); + if (!config) + return -ENOMEM; + + *config = (NAcdConfig)N_ACD_CONFIG_NULL(*config); + + *configp = config; + config = NULL; + return 0; +} + +/** + * XXX + */ +_public_ NAcdConfig *n_acd_config_free(NAcdConfig *config) { + if (!config) + return NULL; + + free(config); + + return NULL; +} + +/** + * XXX + */ +_public_ void n_acd_config_set_ifindex(NAcdConfig *config, int ifindex) { + config->ifindex = ifindex; +} + +/** + * XXX + */ +_public_ void n_acd_config_set_transport(NAcdConfig *config, unsigned int transport) { + config->transport = transport; +} + +/** + * XXX + */ +_public_ void n_acd_config_set_mac(NAcdConfig *config, const uint8_t *mac, size_t n_mac) { + config->n_mac = n_mac; + memcpy(config->mac, mac, n_mac > ETH_ALEN ? 
ETH_ALEN : n_mac); +} + +int n_acd_event_node_new(NAcdEventNode **nodep) { + NAcdEventNode *node; + + node = malloc(sizeof(*node)); + if (!node) + return -ENOMEM; + + *node = (NAcdEventNode)N_ACD_EVENT_NODE_NULL(*node); + + *nodep = node; + return 0; +} + +NAcdEventNode *n_acd_event_node_free(NAcdEventNode *node) { + if (!node) + return NULL; + + c_list_unlink(&node->probe_link); + c_list_unlink(&node->acd_link); + free(node); + + return NULL; +} + +int n_acd_ensure_bpf_map_space(NAcd *acd) { + NAcdProbe *probe; + _cleanup_(n_acd_closep) int fd_map = -1, fd_prog = -1; + size_t max_map; + int r; + + if (acd->n_bpf_map < acd->max_bpf_map) + return 0; + + max_map = 2 * acd->max_bpf_map; + + r = n_acd_bpf_map_create(&fd_map, max_map); + if (r) + return r; + + c_rbtree_for_each_entry(probe, &acd->ip_tree, ip_node) { + r = n_acd_bpf_map_add(fd_map, &probe->ip); + if (r) + return r; + } + + r = n_acd_bpf_compile(&fd_prog, fd_map, (struct ether_addr*) acd->mac); + if (r) + return r; + + if (fd_prog >= 0) { + r = setsockopt(acd->fd_socket, SOL_SOCKET, SO_ATTACH_BPF, &fd_prog, sizeof(fd_prog)); + if (r) + return -n_acd_errno(); + } + + if (acd->fd_bpf_map >= 0) + close(acd->fd_bpf_map); + acd->fd_bpf_map = fd_map; + fd_map = -1; + acd->max_bpf_map = max_map; + return 0; +} + +/** + * n_acd_new() - create a new ACD context + * @acdp: output argument for context + * @config: configuration parameters + * + * Create a new ACD context and return it in @acdp. + * + * Return: 0 on success, or a negative error code on failure. 
+ */ +_public_ int n_acd_new(NAcd **acdp, NAcdConfig *config) { + _cleanup_(n_acd_unrefp) NAcd *acd = NULL; + _cleanup_(n_acd_closep) int fd_bpf_prog = -1; + int r; + + if (config->ifindex <= 0 || + config->transport != N_ACD_TRANSPORT_ETHERNET || + config->n_mac != ETH_ALEN || + !memcmp(config->mac, (uint8_t[ETH_ALEN]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, ETH_ALEN)) + return N_ACD_E_INVALID_ARGUMENT; + + acd = malloc(sizeof(*acd)); + if (!acd) + return -ENOMEM; + + *acd = (NAcd)N_ACD_NULL(*acd); + acd->ifindex = config->ifindex; + memcpy(acd->mac, config->mac, ETH_ALEN); + + r = n_acd_get_random(&acd->seed); + if (r) + return r; + + acd->fd_epoll = epoll_create1(EPOLL_CLOEXEC); + if (acd->fd_epoll < 0) + return -n_acd_errno(); + + r = timer_init(&acd->timer); + if (r < 0) + return r; + + acd->max_bpf_map = 8; + + r = n_acd_bpf_map_create(&acd->fd_bpf_map, acd->max_bpf_map); + if (r) + return r; + + r = n_acd_bpf_compile(&fd_bpf_prog, acd->fd_bpf_map, (struct ether_addr*) acd->mac); + if (r) + return r; + + r = n_acd_socket_new(&acd->fd_socket, fd_bpf_prog, config); + if (r) + return r; + + r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, acd->timer.fd, + &(struct epoll_event){ + .events = EPOLLIN, + .data.u32 = N_ACD_EPOLL_TIMER, + }); + if (r < 0) + return -n_acd_errno(); + + r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, acd->fd_socket, + &(struct epoll_event){ + .events = EPOLLIN, + .data.u32 = N_ACD_EPOLL_SOCKET, + }); + if (r < 0) + return -n_acd_errno(); + + *acdp = acd; + acd = NULL; + return 0; +} + +static void n_acd_free(NAcd *acd) { + NAcdEventNode *node, *t_node; + + if (!acd) + return; + + c_list_for_each_entry_safe(node, t_node, &acd->event_list, acd_link) + n_acd_event_node_free(node); + + assert(c_rbtree_is_empty(&acd->ip_tree)); if (acd->fd_socket >= 0) { assert(acd->fd_epoll >= 0); @@ -227,94 +308,16 @@ static void n_acd_reset(NAcd *acd) { close(acd->fd_socket); acd->fd_socket = -1; } -} -/** - * n_acd_new() - create a new ACD context - * @acdp: output 
argument for context - * - * Create a new ACD context and return it in @acdp. - * - * Return: 0 on success, or a negative error code on failure. - */ -_public_ int n_acd_new(NAcd **acdp) { - NAcd *acd; - int r; - - acd = calloc(1, sizeof(*acd)); - if (!acd) - return -ENOMEM; - - acd->fd_epoll = -1; - acd->fd_timer = -1; - acd->fd_socket = -1; - acd->state = N_ACD_STATE_INIT; - acd->defend = N_ACD_DEFEND_NEVER; - acd->events = (CList)C_LIST_INIT(acd->events); - acd->last_conflict = TIME_INFINITY; - - r = n_acd_get_random(&acd->seed); - if (r < 0) - return r; - - acd->fd_epoll = epoll_create1(EPOLL_CLOEXEC); - if (acd->fd_epoll < 0) { - r = -n_acd_errno(); - goto error; + if (acd->fd_bpf_map >= 0) { + close(acd->fd_bpf_map); + acd->fd_bpf_map = -1; } - acd->fd_timer = timerfd_create(CLOCK_BOOTTIME, TFD_CLOEXEC | TFD_NONBLOCK); - if (acd->fd_timer < 0) { - r = -n_acd_errno(); - goto error; - } - - r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, acd->fd_timer, - &(struct epoll_event){ - .events = EPOLLIN, - .data.u32 = N_ACD_EPOLL_TIMER, - }); - if (r < 0) { - r = -n_acd_errno(); - goto error; - } - - *acdp = acd; - return 0; - -error: - n_acd_free(acd); - return r; -} - -/** - * n_acd_free() - free an ACD context - * - * Frees all resources held by the context. This may be called at any time, - * but doing so invalidates all data owned by the context. - * - * Return: NULL. 
- */ -_public_ NAcd *n_acd_free(NAcd *acd) { - NAcdEventNode *node; - - if (!acd) - return NULL; - - n_acd_reset(acd); - - acd->current = n_acd_event_node_free(acd->current); - - while ((node = c_list_first_entry(&acd->events, NAcdEventNode, link))) - n_acd_event_node_free(node); - - assert(acd->fd_socket < 0); - - if (acd->fd_timer >= 0) { + if (acd->timer.fd >= 0) { assert(acd->fd_epoll >= 0); - epoll_ctl(acd->fd_epoll, EPOLL_CTL_DEL, acd->fd_timer, NULL); - close(acd->fd_timer); - acd->fd_timer = -1; + epoll_ctl(acd->fd_epoll, EPOLL_CTL_DEL, acd->timer.fd, NULL); + timer_deinit(&acd->timer); } if (acd->fd_epoll >= 0) { @@ -323,256 +326,166 @@ _public_ NAcd *n_acd_free(NAcd *acd) { } free(acd); +} +/** + * XXX + */ +_public_ NAcd *n_acd_ref(NAcd *acd) { + if (acd) + ++acd->n_refs; + return acd; +} + +/** + * XXX + */ +_public_ NAcd *n_acd_unref(NAcd *acd) { + if (acd && !--acd->n_refs) + n_acd_free(acd); return NULL; } +int n_acd_raise(NAcd *acd, NAcdEventNode **nodep, unsigned int event) { + NAcdEventNode *node; + int r; + + r = n_acd_event_node_new(&node); + if (r) + return r; + + node->event.event = event; + c_list_link_tail(&acd->event_list, &node->acd_link); + + if (nodep) + *nodep = node; + return 0; +} + +int n_acd_send(NAcd *acd, const struct in_addr *tpa, const struct in_addr *spa) { + struct sockaddr_ll address = { + .sll_family = AF_PACKET, + .sll_protocol = htobe16(ETH_P_ARP), + .sll_ifindex = acd->ifindex, + .sll_halen = ETH_ALEN, + .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + }; + struct ether_arp arp = { + .ea_hdr = { + .ar_hrd = htobe16(ARPHRD_ETHER), + .ar_pro = htobe16(ETHERTYPE_IP), + .ar_hln = sizeof(acd->mac), + .ar_pln = sizeof(uint32_t), + .ar_op = htobe16(ARPOP_REQUEST), + }, + }; + ssize_t l; + int r; + + memcpy(arp.arp_sha, acd->mac, sizeof(acd->mac)); + memcpy(arp.arp_tpa, &tpa->s_addr, sizeof(uint32_t)); + + if (spa) + memcpy(arp.arp_spa, &spa->s_addr, sizeof(spa->s_addr)); + + l = sendto(acd->fd_socket, + &arp, + sizeof(arp), 
+ MSG_NOSIGNAL, + (struct sockaddr *)&address, + sizeof(address)); + if (l < 0) { + if (errno == EAGAIN || errno == ENOBUFS) { + /* + * We never maintain outgoing queues. We rely on the + * network device to do that for us. In case the queues + * are full, or the kernel refuses to queue the packet + * for other reasons, we must tell our caller that the + * packet was dropped. + */ + return N_ACD_E_DROPPED; + } else if (errno == ENETDOWN || errno == ENXIO) { + /* + * These errors happen if the network device went down + * or was actually removed. We always propagate this as + * event, so the user can react accordingly (similarly + * to the recvmmsg(2) handler). In case the user does + * not immediately react, we also tell our caller that + * the packet was dropped, so we don't erroneously + * treat this as success. + */ + + r = n_acd_raise(acd, NULL, N_ACD_EVENT_DOWN); + if (r) + return r; + + return N_ACD_E_DROPPED; + } + + /* + * Random network error. We treat this as fatal and propagate + * the error, so it is noticed and can be investigated. + */ + return -n_acd_errno(); + } else if (l != (ssize_t)sizeof(arp)) { + /* + * Ugh, the kernel modified the packet. This is unexpected. We + * consider the packet lost. + */ + return N_ACD_E_DROPPED; + } + + return 0; +} + /** * n_acd_get_fd() - get pollable file descriptor * @acd: ACD context * @fdp: output argument for file descriptor * - * Returns a file descriptor in @fdp. This filedescriptor can be polled by + * Returns a file descriptor in @fdp. This file descriptor can be polled by * the caller to indicate when the ACD context can be dispatched. 
*/ _public_ void n_acd_get_fd(NAcd *acd, int *fdp) { *fdp = acd->fd_epoll; } -static int n_acd_push_event(NAcd *acd, unsigned int event, uint16_t *operation, uint8_t (*sender)[6], uint8_t (*target)[4]) { - NAcdEventNode *node; - int r; - - r = n_acd_event_node_new(&node, event); - if (r < 0) - return r; - - switch (event) { - case N_ACD_EVENT_USED: - node->event.used.operation = be16toh(*operation); - memcpy(node->sender, sender, sizeof(node->sender)); - node->event.used.sender = node->sender; - node->event.used.n_sender = sizeof(node->sender); - memcpy(&node->event.used.target, target, sizeof(node->event.used.target)); - break; - case N_ACD_EVENT_CONFLICT: - node->event.conflict.operation = be16toh(*operation); - memcpy(node->sender, sender, sizeof(node->sender)); - node->event.used.sender = node->sender; - node->event.used.n_sender = sizeof(node->sender); - memcpy(&node->event.conflict.target, target, sizeof(node->event.conflict.target)); - break; - case N_ACD_EVENT_DEFENDED: - node->event.defended.operation = be16toh(*operation); - memcpy(node->sender, sender, sizeof(node->sender)); - node->event.used.sender = node->sender; - node->event.used.n_sender = sizeof(node->sender); - memcpy(&node->event.defended.target, target, sizeof(node->event.defended.target)); - break; - case N_ACD_EVENT_READY: - case N_ACD_EVENT_DOWN: - break; - default: - assert(0); - } - - c_list_link_tail(&acd->events, &node->link); - - return 0; -} - -static int n_acd_now(uint64_t *nowp) { - struct timespec ts; - int r; - - r = clock_gettime(CLOCK_BOOTTIME, &ts); - if (r < 0) - return -n_acd_errno(); - - *nowp = ts.tv_sec * UINT64_C(1000000) + ts.tv_nsec / UINT64_C(1000); - return 0; -} - -static int n_acd_schedule(NAcd *acd, uint64_t u_timeout, unsigned int u_jitter) { - uint64_t u_next = u_timeout; - int r; - - /* - * ACD specifies jitter values to reduce packet storms on the local - * link. This call accepts the maximum relative jitter value in - * microseconds as @u_jitter. 
We then use rand_r(3p) to get a - * pseudo-random jitter on top of the real timeout given as @u_timeout. - * Note that rand_r() is fine for this. Before you try to improve the - * RNG, you better spend some time securing ARP. - */ - if (u_jitter) - u_next += rand_r(&acd->seed) % u_jitter; - - /* - * Setting .it_value to 0 in timerfd_settime() disarms the timer. Avoid - * this and always schedule at least 1us. Otherwise, we'd have to - * recursively call into the time-out handler, which we really want to - * avoid. No reason to optimize performance here. - */ - if (!u_next) - u_next = 1; - - r = timerfd_settime(acd->fd_timer, 0, - &(struct itimerspec){ .it_value = { - .tv_sec = u_next / UINT64_C(1000000), - .tv_nsec = u_next % UINT64_C(1000000) * UINT64_C(1000), - } }, NULL); - if (r < 0) - return -n_acd_errno(); - - return 0; -} - -static int n_acd_send(NAcd *acd, const struct in_addr *spa) { - struct sockaddr_ll address = { - .sll_family = AF_PACKET, - .sll_protocol = htobe16(ETH_P_ARP), - .sll_ifindex = acd->config.ifindex, - .sll_halen = ETH_ALEN, - .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - }; - struct ether_arp arp = { - .ea_hdr.ar_hrd = htobe16(ARPHRD_ETHER), - .ea_hdr.ar_pro = htobe16(ETHERTYPE_IP), - .ea_hdr.ar_hln = sizeof(acd->mac), - .ea_hdr.ar_pln = sizeof(uint32_t), - .ea_hdr.ar_op = htobe16(ARPOP_REQUEST), - }; - ssize_t l; - - memcpy(arp.arp_sha, acd->mac, sizeof(acd->mac)); - memcpy(arp.arp_tpa, &acd->config.ip.s_addr, sizeof(uint32_t)); - - if (spa) - memcpy(arp.arp_spa, &spa->s_addr, sizeof(spa->s_addr)); - - l = sendto(acd->fd_socket, &arp, sizeof(arp), MSG_NOSIGNAL, (struct sockaddr *)&address, sizeof(address)); - if (l == (ssize_t)sizeof(arp)) { - /* Packet was properly sent. */ - return 0; - } else if (l >= 0) { - /* - * Ugh. The packet was truncated. This should not happen, but - * lets just pretend the packet was dropped. 
- */ - return 0; - } else if (errno == EAGAIN || errno == ENOBUFS) { - /* - * In case the output buffer is full, the packet is silently - * dropped. This is just as if the physical layer happened to - * drop the packet. We are not on a reliable medium, so no - * reason to pretend we are. - */ - return 0; - } else if (errno == ENETDOWN || errno == ENXIO) { - /* - * We get ENETDOWN if the network-device goes down or is - * removed. ENXIO might happen on async send-operations if the - * network-device was unplugged and thus the kernel is no - * longer aware of it. - * In any case, we do not allow proceeding with this socket. We - * stop the engine and notify the user gracefully. - */ - return -N_ACD_E_DOWN; - } - - return -n_acd_errno(); -} - -static void n_acd_remember_conflict(NAcd *acd, uint64_t now) { - if (++acd->n_conflicts >= N_ACD_RFC_MAX_CONFLICTS) { - acd->n_conflicts = N_ACD_RFC_MAX_CONFLICTS; - acd->last_conflict = now; - } -} - static int n_acd_handle_timeout(NAcd *acd) { + NAcdProbe *probe; + uint64_t now; int r; - switch (acd->state) { - case N_ACD_STATE_PROBING: - /* - * We are still PROBING. We send 3 probes with a random timeout - * scheduled between each. If, after a fixed timeout, we did - * not receive any conflict we consider the probing successful. - */ - if (acd->n_iteration >= N_ACD_RFC_PROBE_NUM) { - /* - * All 3 probes succeeded and we waited enough to - * consider this address usable by now. Do not announce - * the address, yet. We must first give the caller a - * chance to configure the address (so they can answer - * ARP requests), before announcing it. But our - * callbacks are not necessarily synchronous (we want - * to allow IPC there), so just notify the caller and - * wait for further instructions, thus effectively - * increasing the probe-wait. 
- */ - r = n_acd_push_event(acd, N_ACD_EVENT_READY, NULL, NULL, NULL); - if (r) - return r; + /* + * Read the current time once, and handle all timeouts that triggered + * before the current time. Rereading the current time in each loop + * might risk creating a live-lock, and the fact that we read the + * time after reading the timer guarantees that the timeout which + * woke us up is handled. + * + * When there are no more timeouts to handle at the given time, we + * rearm the timer to potentially wake us up again in the future. + */ + timer_now(&acd->timer, &now); - acd->state = N_ACD_STATE_CONFIGURING; - } else { - /* - * We have not sent all 3 probes, yet. A timer fired, - * so we are ready to send the next probe. If this is - * the third probe, schedule a timer for ANNOUNCE_WAIT - * to give other peers a chance to answer. If this is - * not the third probe, wait between PROBE_MIN and - * PROBE_MAX for the next probe. - */ + for (;;) { + Timeout *timeout; - r = n_acd_send(acd, NULL); - if (r < 0) - return r; - - if (++acd->n_iteration >= N_ACD_RFC_PROBE_NUM) - r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_ANNOUNCE_WAIT_USEC, 0); - else - r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_PROBE_MIN_USEC, - acd->timeout_multiplier * (N_ACD_RFC_PROBE_MAX_USEC - N_ACD_RFC_PROBE_MIN_USEC)); - if (r < 0) - return r; - } - - break; - - case N_ACD_STATE_ANNOUNCING: - /* - * We are ANNOUNCING, meaning the caller configured the address - * on the interface and is actively using it. We send 3 - * announcements out, in a short interval, and then just - * perform passive conflict detection. - * Note that once all 3 announcements are sent, we no longer - * schedule a timer, so this part should not trigger, anymore. 
- */ - - r = n_acd_send(acd, &acd->config.ip); - if (r < 0) + r = timer_pop_timeout(&acd->timer, now, &timeout); + if (r < 0) { return r; - - if (++acd->n_iteration < N_ACD_RFC_ANNOUNCE_NUM) { - r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_ANNOUNCE_INTERVAL_USEC, 0); - if (r < 0) - return r; + } else if (!timeout) { + /* + * There are no more timeouts pending before @now. Rearm + * the timer to fire again at the next timeout. + */ + timer_rearm(&acd->timer); + break; } - break; - - case N_ACD_STATE_INIT: - case N_ACD_STATE_CONFIGURING: - default: - /* - * There are no timeouts in these states. If we trigger one, - * something is fishy. Let the caller deal with this. - */ - return -EIO; + probe = (void *)timeout - offsetof(NAcdProbe, timeout); + r = n_acd_probe_handle_timeout(probe); + if (r) + return r; } return 0; @@ -580,136 +493,94 @@ static int n_acd_handle_timeout(NAcd *acd) { static int n_acd_handle_packet(NAcd *acd, struct ether_arp *packet) { bool hard_conflict; - uint64_t now; + NAcdProbe *probe; + uint32_t addr; + CRBNode *node; int r; /* - * Via BPF we discard any non-conflict packets. There are only 2 types - * that can pass: A conflict on the Sender Protocol Address, or a - * conflict on the Target Protocol Address. + * We are interested in 2 kinds of ARP messages: * - * The former we call a hard-conflict. It implies that the sender uses - * the address already. We must always catch this and in some way react - * to it. Any kind, REQUEST or REPLY must be caught (though it is - * unlikely that we ever catch REPLIES since they tend to be unicasts). + * 1) Someone who is *NOT* us sends *ANY* ARP message with our IP + * address as sender. This is never good, because it implies an + * address conflict. + * We call this a hard-conflict. * - * However, in case the Target Protocol Address matches, we just know - * that somebody is looking for the address. 
Hence, we must also check - * that the packet is an ARP-Probe (Sender Protocol Address is 0). If - * it is, it means someone else does ACD on our address. We call this a - * soft conflict. + * 2) Someone who is *NOT* us sends an ARP REQUEST without any sender + * IP, but our IP as target. This implies someone else performs an + * ARP Probe with our address. This also implies a conflict, but + * one that can be resolved by responding to the probe. + * We call this a soft-conflict. + * + * We are never interested in any other ARP message. The kernel already + * deals with everything else, hence, we can silently ignore those. + * + * Now, we simply check whether a sender-address is set. This allows us + * to distinguish both cases. We then check further conditions, so we + * can bail out early if neither is the case. + * + * Lastly, we perform a lookup in our probe-set to check whether the + * address actually matches, so we can let these probes dispatch the + * message. Note that we allow duplicate probes, so we need to dispatch + * each matching probe, not just one. */ - if (!memcmp(packet->arp_spa, (uint8_t[4]){ }, sizeof(packet->arp_spa)) && - !memcmp(packet->arp_tpa, &acd->config.ip.s_addr, sizeof(packet->arp_tpa)) && - packet->ea_hdr.ar_op == htobe16(ARPOP_REQUEST)) { - hard_conflict = false; - } else if (!memcmp(packet->arp_spa, &acd->config.ip.s_addr, sizeof(packet->arp_spa))) { + + if (memcmp(packet->arp_spa, (uint8_t[4]){ }, sizeof(packet->arp_spa))) { + memcpy(&addr, packet->arp_spa, sizeof(addr)); hard_conflict = true; + } else if (packet->ea_hdr.ar_op == htobe16(ARPOP_REQUEST)) { + memcpy(&addr, packet->arp_tpa, sizeof(addr)); + hard_conflict = false; } else { /* - * Ignore anything that is specific enough to match the BPF - * filter, but is none of the conflicts described above. 
- */ - return 0; - } - - r = n_acd_now(&now); - if (r < 0) - return r; - - switch (acd->state) { - case N_ACD_STATE_PROBING: - /* - * Regardless whether this is a hard or soft conflict, we must - * treat this as a probe failure. That is, notify the caller of - * the conflict and wait for further instructions. We do not - * react to this, until the caller tells us what to do. But we - * immediately stop the engine, since there is no point in - * continuing the probing. - */ - n_acd_remember_conflict(acd, now); - n_acd_reset(acd); - r = n_acd_push_event(acd, N_ACD_EVENT_USED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa); - if (r) - return r; - - break; - - case N_ACD_STATE_CONFIGURING: - /* - * We are waiting for the caller to configure the interface and - * start ANNOUNCING. In this state, we cannot defend the address - * as that would indicate that it is ready to be used, and we - * cannot signal CONFLICT or USED as the caller may already have - * started to use the address (and may have configured the engine - * to always defend it, which means they should be able to rely on - * never losing it after READY). Simply drop the event, and rely - * on the anticipated ANNOUNCE to trigger it again. - */ - - break; - - case N_ACD_STATE_ANNOUNCING: - /* - * We were already instructed to announce the address, which - * means the address is configured and in use. Hence, the - * caller is responsible to serve regular ARP queries. Meaning, - * we can ignore any soft conflicts (other peers doing ACD). - * - * But if we see a hard-conflict, we either defend the address - * according to the caller's instructions, or we report the - * conflict and bail out. 
- */ - - if (!hard_conflict) - break; - - if (acd->defend == N_ACD_DEFEND_NEVER) { - n_acd_remember_conflict(acd, now); - n_acd_reset(acd); - r = n_acd_push_event(acd, N_ACD_EVENT_CONFLICT, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa); - if (r) - return r; - } else { - if (now > acd->last_defend + N_ACD_RFC_DEFEND_INTERVAL_USEC) { - r = n_acd_send(acd, &acd->config.ip); - if (r < 0) - return r; - - acd->last_defend = now; - r = n_acd_push_event(acd, N_ACD_EVENT_DEFENDED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa); - if (r) - return r; - } else if (acd->defend == N_ACD_DEFEND_ONCE) { - n_acd_remember_conflict(acd, now); - n_acd_reset(acd); - r = n_acd_push_event(acd, N_ACD_EVENT_CONFLICT, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa); - if (r) - return r; - } else { - r = n_acd_push_event(acd, N_ACD_EVENT_DEFENDED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa); - if (r) - return r; - } - } - - break; - - case N_ACD_STATE_INIT: - default: - /* - * The socket should not be dispatched in those states, since - * it is neither allocated nor added to epoll. Fail hard if we - * trigger this somehow. + * The BPF filter will not let through any other packet. */ return -EIO; } + /* Find top-most node that matches @addr. */ + node = acd->ip_tree.root; + while (node) { + probe = c_rbnode_entry(node, NAcdProbe, ip_node); + if (addr < probe->ip.s_addr) + node = node->left; + else if (addr > probe->ip.s_addr) + node = node->right; + else + break; + } + + /* + * If the address is unknown, we drop the packet. This might happen if + * the kernel queued the packet and passed the BPF filter, but we + * modified the set before dequeuing the message. + */ + if (!node) + return 0; + + /* Forward to left-most child that still matches @addr. */ + while (node->left && addr == c_rbnode_entry(node->left, + NAcdProbe, + ip_node)->ip.s_addr) + node = node->left; + + /* Iterate all matching entries in-order. 
*/ + do { + probe = c_rbnode_entry(node, NAcdProbe, ip_node); + + r = n_acd_probe_handle_packet(probe, packet, hard_conflict); + if (r) + return r; + + node = c_rbnode_next(node); + } while (node && addr == c_rbnode_entry(node, + NAcdProbe, + ip_node)->ip.s_addr); + return 0; } static int n_acd_dispatch_timer(NAcd *acd, struct epoll_event *event) { - uint64_t v; int r; if (event->events & (EPOLLHUP | EPOLLERR)) { @@ -722,99 +593,113 @@ static int n_acd_dispatch_timer(NAcd *acd, struct epoll_event *event) { } if (event->events & EPOLLIN) { - for (unsigned int i = 0; i < 128; ++i) { - r = read(acd->fd_timer, &v, sizeof(v)); - if (r == sizeof(v)) { - /* - * We successfully read a timer-value. Handle it and - * return. We do NOT fall-through to EPOLLHUP handling, - * as we always must drain buffers first. - */ - return n_acd_handle_timeout(acd); - } else if (r >= 0) { - /* - * Kernel guarantees 8-byte reads; fail hard if it - * suddenly starts doing weird shit. No clue what to do - * with those values, anyway. - */ - return -EIO; - } else if (errno == EAGAIN) { - /* - * No more pending events. - */ - return 0; - } else { - /* - * Something failed. We use CLOCK_BOOTTIME, so - * ECANCELED cannot happen. Hence, there is no error - * that we could gracefully handle. Fail hard and let - * the caller deal with it. - */ - return -n_acd_errno(); - } - } + r = timer_read(&acd->timer); + if (r <= 0) + return r; - return N_ACD_E_PREEMPTED; + assert(r == TIMER_E_TRIGGERED); + + /* + * A timer triggered, handle all pending timeouts at a given + * point in time. There can only be a finite number of pending + * timeouts, any new ones will be in the future, so not handled + * now, but guaranteed to wake us up again when they do trigger. 
+ */ + r = n_acd_handle_timeout(acd); + if (r) + return r; } return 0; } -static int n_acd_dispatch_socket(NAcd *acd, struct epoll_event *event) { - struct ether_arp packet; - ssize_t l; +static bool n_acd_packet_is_valid(NAcd *acd, void *packet, size_t n_packet) { + struct ether_arp *arp; - for (unsigned int i = 0; i < 128; ++i) { - /* - * Regardless whether EPOLLIN is set in @event->events, we always - * invoke recv(2). This is a safety-net for sockets, which always fetch - * queued errors on all syscalls. That means, if anything failed on the - * socket, we will be notified via recv(2). This simplifies the code - * and avoid magic EPOLLIN/ERR/HUP juggling. - * - * Note that we must use recv(2) over read(2), since the latter cannot - * deal with empty packets properly. - * - * We explicitly skip passing MSG_TRUNC here. We *WANT* - * overlong packets to be retrieved and truncated. Ethernet - * frames might not have byte-granular lengths. Real hardware - * does add trailing padding/garbage, so we must discard this - * here. - */ - l = recv(acd->fd_socket, &packet, sizeof(packet), 0); - if (l == (ssize_t)sizeof(packet)) { + /* + * The eBPF filter will ensure that this function always returns true, however, + * this allows the eBPF filter to be an optional optimization which is necessary + * on older kernels. + * + * See comments in n-acd-bpf.c for details. 
+ */ + + if (n_packet != sizeof(*arp)) + return false; + + arp = packet; + + if (arp->arp_hrd != htobe16(ARPHRD_ETHER)) + return false; + + if (arp->arp_pro != htobe16(ETHERTYPE_IP)) + return false; + + if (arp->arp_hln != sizeof(struct ether_addr)) + return false; + + if (arp->arp_pln != sizeof(struct in_addr)) + return false; + + if (!memcmp(arp->arp_sha, acd->mac, sizeof(struct ether_addr))) + return false; + + if (memcmp(arp->arp_spa, &((struct in_addr) { INADDR_ANY }), sizeof(struct in_addr))) { + if (arp->arp_op != htobe16(ARPOP_REQUEST) && arp->arp_op != htobe16(ARPOP_REPLY)) + return false; + } else if (arp->arp_op != htobe16(ARPOP_REQUEST)) { + return false; + } + + return true; +} + +static int n_acd_dispatch_socket(NAcd *acd, struct epoll_event *event) { + const size_t n_batch = 8; + struct mmsghdr msgs[n_batch]; + struct iovec iovecs[n_batch]; + struct ether_arp data[n_batch]; + size_t i; + int r, n; + + for (i = 0; i < n_batch; ++i) { + iovecs[i].iov_base = data + i; + iovecs[i].iov_len = sizeof(data[i]); + msgs[i].msg_hdr = (struct msghdr){ + .msg_iov = iovecs + i, + .msg_iovlen = 1, + }; + } + + /* + * We always directly call into recvmmsg(2), regardless which EPOLL* + * event is signalled. On sockets, the recv(2)-family of syscalls does + * a suitable job of handling all possible scenarios and telling us + * about it. Hence, lets take the easy route and always ask the kernel + * about the current state. + */ + n = recvmmsg(acd->fd_socket, msgs, n_batch, 0, NULL); + if (n < 0) { + if (errno == ENETDOWN) { /* - * We read a full ARP packet. We never fall-through to EPOLLHUP - * handling, as we always must drain buffers first. + * We get ENETDOWN if the network-device goes down or + * is removed. This error is temporary and only queued + * once. Subsequent reads will simply return EAGAIN + * until the device is up again and has data queued. 
+ * Usually, the caller should tear down all probes when + * an interface goes down, but we leave it up to the + * caller to decide what to do. We propagate the code + * and continue. */ - return n_acd_handle_packet(acd, &packet); - } else if (l >= 0) { - /* - * The BPF filter discards short packets, so error out - * if something slips through for any reason. Don't silently - * ignore it, since we explicitly want to know if something - * went fishy. - */ - return -EIO; - } else if (errno == ENETDOWN || errno == ENXIO) { - /* - * We get ENETDOWN if the network-device goes down or is - * removed. ENXIO might happen on async send-operations if the - * network-device was unplugged and thus the kernel is no - * longer aware of it. - * In any case, we do not allow proceeding with this socket. We - * stop the engine and notify the user gracefully. - */ - return -N_ACD_E_DOWN; + return n_acd_raise(acd, NULL, N_ACD_EVENT_DOWN); } else if (errno == EAGAIN) { /* - * We cannot read data from the socket (we got EAGAIN). As a safety net - * check for EPOLLHUP/ERR. Those cannot be disabled with epoll, so we - * must make sure to not busy-loop by ignoring them. Note that we know - * recv(2) on sockets to return an error if either of these epoll-flags - * is set. Hence, if we did not handle it above, we have no other way - * but treating those flags as fatal errors and returning them to the - * caller. + * There is no more data queued and we did not get + * preempted. Everything is good to go. + * As a safety-net against busy-looping, we do check + * for HUP/ERR. Neither should be set, since they imply + * error-dequeue behavior on all socket calls. Lets + * fail hard if we trigger it, so we can investigate. */ if (event->events & (EPOLLHUP | EPOLLERR)) return -EIO; @@ -822,35 +707,63 @@ static int n_acd_dispatch_socket(NAcd *acd, struct epoll_event *event) { return 0; } else { /* - * Cannot dispatch the packet. This might be due to OOM, HUP, - * or something else. 
We cannot handle it gracefully so forward - * to the caller. + * Something went wrong. Propagate the error-code, so + * this can be investigated. */ return -n_acd_errno(); } + } else if (n >= (ssize_t)n_batch) { + /* + * If all buffers were filled with data, we cannot be sure that + * there is nothing left to read. But to avoid starvation, we + * cannot loop on this condition. Instead, we mark the context + * as preempted so the caller can call us again. + * Note that in level-triggered event-loops this condition can + * be neglected, but in edge-triggered event-loops it is + * crucial to forward this information. + * + * On the other hand, there are several conditions where the + * kernel might return less batches than requested, but was + * still preempted. However, all of those cases require the + * preemption to have triggered a wakeup *after* we entered + * recvmmsg(). Hence, even if we did not recognize the + * preemption, an edge must have triggered and as such we will + * handle the event on the next turn. + */ + acd->preempted = true; } - return N_ACD_E_PREEMPTED; + for (i = 0; (ssize_t)i < n; ++i) { + if (!n_acd_packet_is_valid(acd, data + i, msgs[i].msg_len)) + continue; + /* + * Handle the packet. Bail out if something went wrong. Note + * that this must be fatal errors, since we discard all other + * packets that follow. + */ + r = n_acd_handle_packet(acd, data + i); + if (r) + return r; + } + + return 0; } /** - * n_acd_dispatch() - dispatch ACD context - * @acd: ACD context - * - * Return: 0 on successful dispatch of all pending events, N_ACD_E_PREEMPT in - * case there are more still more events to be dispatched, or a - * negative error code on failure. + * XXX */ _public_ int n_acd_dispatch(NAcd *acd) { struct epoll_event events[2]; int n, i, r = 0; - bool preempted = false; n = epoll_wait(acd->fd_epoll, events, sizeof(events) / sizeof(*events), 0); if (n < 0) { + /* Linux never returns EINTR if `timeout == 0'. 
*/ return -n_acd_errno(); } + acd->preempted = false; + for (i = 0; i < n; ++i) { switch (events[i].data.u32) { case N_ACD_EPOLL_TIMER: @@ -860,35 +773,16 @@ _public_ int n_acd_dispatch(NAcd *acd) { r = n_acd_dispatch_socket(acd, events + i); break; default: + assert(0); r = 0; break; } - if (r == N_ACD_E_PREEMPTED) - preempted = true; - else if (r != 0) - break; - } - - if (r == -N_ACD_E_DOWN) { - /* - * N_ACD_E_DOWN is synthesized whenever we notice - * ENETDOWN-related errors on the network interface. This - * allows bailing out of deep call-paths and then handling the - * error gracefully here. - */ - n_acd_reset(acd); - r = n_acd_push_event(acd, N_ACD_EVENT_DOWN, NULL, NULL, NULL); if (r) return r; - - return 0; } - if (preempted) - return N_ACD_E_PREEMPTED; - else - return r; + return acd->preempted ? N_ACD_E_PREEMPTED : 0; } /** @@ -898,343 +792,75 @@ _public_ int n_acd_dispatch(NAcd *acd) { * * Returns a pointer to the next pending event. The event is still owend by * the context, and is only valid until the next call to n_acd_pop_event() - * or until the context is freed. + * or until the owning object is freed (either the ACD context or the indicated + * probe object). + * + * An event either originates on the ACD context, or one of the configured + * probes. If the event-type has a 'probe' pointer, it originated on the + * indicated probe (which is *never* NULL), otherwise it originated on the + * context. + * + * Users must call this function repeatedly until either an error is returned, + * or the event-pointer is NULL. Wakeups on the epoll-fd are only guaranteed + * for each batch of events. Hence, it is the caller's responsibility to drain + * the event-queue somehow after each call to n_acd_dispatch(). Note that + * events can only be added by n_acd_dispatch(), hence, you cannot live-lock + * when draining the event queue. 
* * The possible events are: - * * N_ACD_EVENT_READY: The configured IP address was probed successfully + * * N_ACD_EVENT_READY: A configured IP address was probed successfully * and is ready to be used. Once configured on the * interface, the caller must call n_acd_announce() * to announce and start defending the address. - * No further events may be received before - * n_acd_announce() has been called. * * N_ACD_EVENT_USED: Someone is already using the IP address being - * probed. The engine was stopped, and the caller - * may restart it to try again. - * * N_ACD_EVENT_DEFENDED: A conflict was detected for the announced IP + * probed. The probe is put into stopped state and + * should be freed by the caller. + * * N_ACD_EVENT_DEFENDED: A conflict was detected for an announced IP * address, and the engine attempted to defend it. * This is purely informational, and no action is * required by the caller. - * * N_ACD_EVENT_CONFLICT: A conflict was detected for the announced IP - * address, and the engine was not able to defend + * * N_ACD_EVENT_CONFLICT: A conflict was detected for an announced IP + * address, and the probe was not able to defend * it (according to the configured policy). The - * engine has stoppde, the caller must stop using - * the address immediately, and may restart the - * engine to retry. - * * N_ACD_EVENT_DOWN: A network error was detected. The engine was - * stopped and it is the responsibility of the - * caller to restart it once the network may be - * functional again. + * probe halted, the caller must stop using + * the address immediately, and should free the probe. + * * N_ACD_EVENT_DOWN: The specified network interface was put down. The + * user is recommended to free *ALL* probes and + * recreate them as soon as the interface is up again. + * Note that this event is purely informational. 
The + * probes will continue running, but all packets will + * be blackholed, and no network packets are received, + * until the network is back up again. Hence, from an + * operational perspective, the legitimacy of the ACD + * probes is lost and the user better re-probes all + * addresses. * - * Returns: 0 on success, N_ACD_E_STOPPED if there are no more events and - * the engine has been stopped, N_ACD_E_DONE if there are no more - * events, but the engine is still running, or a negative error - * code on failure. + * Returns: 0 on success, negative error code on failure. The popped event is + * returned in @eventp. If no event is pending, NULL is placed in + * @eventp and 0 is returned. If an error is returned, @eventp is left + * untouched. */ _public_ int n_acd_pop_event(NAcd *acd, NAcdEvent **eventp) { - acd->current = n_acd_event_node_free(acd->current); + NAcdEventNode *node, *t_node; - if (c_list_is_empty(&acd->events)) { - if (acd->state == N_ACD_STATE_INIT) - return N_ACD_E_STOPPED; - else - return N_ACD_E_DONE; - } - - acd->current = c_list_first_entry(&acd->events, NAcdEventNode, link); - c_list_unlink(&acd->current->link); - - if (eventp) - *eventp = &acd->current->event; - - return 0; -} - -static int n_acd_bind_socket(NAcd *acd, int s) { - /* - * Due to strict aliasing, we cannot get uint32_t/uint16_t pointers to - * acd->config.mac, so provide a union accessor. - */ - const union { - uint8_t u8[6]; - uint16_t u16[3]; - uint32_t u32[1]; - } mac = { - .u8 = { - acd->mac[0], - acd->mac[1], - acd->mac[2], - acd->mac[3], - acd->mac[4], - acd->mac[5], - }, - }; - struct sock_filter filter[] = { - /* - * Basic ARP header validation. Make sure the packet-length, - * wire type, protocol type, and address lengths are correct. - */ - BPF_STMT(BPF_LD + BPF_W + BPF_LEN, 0), /* A <- packet length */ - BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, sizeof(struct ether_arp), 1, 0), /* #packet >= #arp-packet ? 
*/ - BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ - BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_hrd)), /* A <- header */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPHRD_ETHER, 1, 0), /* header == ethernet ? */ - BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ - BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_pro)), /* A <- protocol */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_IP, 1, 0), /* protocol == IP ? */ - BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ - BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_hln)), /* A <- hardware address length */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct ether_addr), 1, 0), /* length == sizeof(ether_addr)? */ - BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ - BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_pln)), /* A <- protocol address length */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct in_addr), 1, 0), /* length == sizeof(in_addr) ? */ - BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ - BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_op)), /* A <- operation */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REQUEST, 2, 0), /* protocol == request ? */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REPLY, 1, 0), /* protocol == reply ? */ - BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ - - /* - * Sender hardware address must be different from ours. Note - * that BPF runs in big-endian mode, but assumes immediates are - * given in native-endian. This might look weird on 6-byte mac - * addresses, but is needed to revert the BPF magic. - */ - BPF_STMT(BPF_LD + BPF_IMM, be32toh(mac.u32[0])), /* A <- 4 bytes of client's MAC */ - BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */ - BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_sha)), /* A <- 4 bytes of SHA */ - BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* A xor X */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 6), /* A == 0 ? 
*/ - BPF_STMT(BPF_LD + BPF_IMM, be16toh(mac.u16[2])), /* A <- remainder of client's MAC */ - BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */ - BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, arp_sha) + 4), /* A <- remainder of SHA */ - BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* A xor X */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */ - BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ - - /* - * Sender protocol address or target protocol address must be - * equal to the one we care about. Again, immediates must be - * given in native-endian. - */ - BPF_STMT(BPF_LD + BPF_IMM, be32toh(acd->config.ip.s_addr)), /* A <- clients IP */ - BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */ - BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_spa)), /* A <- SPA */ - BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* X xor A */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */ - BPF_STMT(BPF_RET + BPF_K, 65535), /* return all */ - BPF_STMT(BPF_LD + BPF_IMM, be32toh(acd->config.ip.s_addr)), /* A <- clients IP */ - BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */ - BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_tpa)), /* A <- TPA */ - BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* X xor A */ - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */ - BPF_STMT(BPF_RET + BPF_K, 65535), /* return all */ - BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */ - }; - const struct sock_fprog fprog = { - .len = sizeof(filter) / sizeof(*filter), - .filter = filter, - }; - const struct sockaddr_ll address = { - .sll_family = AF_PACKET, - .sll_protocol = htobe16(ETH_P_ARP), - .sll_ifindex = acd->config.ifindex, - .sll_halen = ETH_ALEN, - .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - }; - int r; - - /* - * Install a packet filter that matches on the ARP header and - * addresses, to reduce the number of wake-ups to a minimum. 
- */ - r = setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog)); - if (r < 0) - return -n_acd_errno(); - - /* - * Bind the packet-socket to ETH_P_ARP and the specified network - * interface. - */ - r = bind(s, (struct sockaddr *)&address, sizeof(address)); - if (r < 0) - return -n_acd_errno(); - - return 0; -} - -static int n_acd_setup_socket(NAcd *acd) { - int r, s; - - s = socket(PF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0); - if (s < 0) - return -n_acd_errno(); - - r = n_acd_bind_socket(acd, s); - if (r < 0) - goto error; - - r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, s, - &(struct epoll_event){ - .events = EPOLLIN, - .data.u32 = N_ACD_EPOLL_SOCKET, - }); - if (r < 0) { - r = -n_acd_errno(); - goto error; - } - - acd->fd_socket = s; - return 0; - -error: - close(s); - return r; -} - -/** - * n_acd_start() - start the ACD engine - * @acd: ACD context - * @config: description of interface and desired IP address - * - * Start probing the given address on the given interface. - * - * The engine must not already be running, and there must not be - * any pending events. - * - * Returns: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the configuration - * was invalid, N_ACD_E_BUSY if the engine is running or there are - * pending events, or a negative error code on failure. 
- */ -_public_ int n_acd_start(NAcd *acd, NAcdConfig *config) { - uint64_t now, delay; - int r; - - if (config->ifindex <= 0 || - config->transport != N_ACD_TRANSPORT_ETHERNET || - config->n_mac != ETH_ALEN || - !memcmp(config->mac, (uint8_t[ETH_ALEN]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, ETH_ALEN) || - !config->ip.s_addr) - return N_ACD_E_INVALID_ARGUMENT; - - if (acd->state != N_ACD_STATE_INIT || !c_list_is_empty(&acd->events)) - return N_ACD_E_BUSY; - - acd->config = *config; - memcpy(acd->mac, config->mac, config->n_mac); - acd->config.mac = acd->mac; - acd->timeout_multiplier = config->timeout_msec; - - r = n_acd_setup_socket(acd); - if (r < 0) - goto error; - - if (acd->timeout_multiplier) { - delay = 0; - acd->n_iteration = 0; - - if (acd->last_conflict != TIME_INFINITY) { - r = n_acd_now(&now); - if (r < 0) - goto error; - - if (now < acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC) - delay = acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC - now; + c_list_for_each_entry_safe(node, t_node, &acd->event_list, acd_link) { + if (node->is_public) { + n_acd_event_node_free(node); + continue; } - r = n_acd_schedule(acd, delay, acd->timeout_multiplier * N_ACD_RFC_PROBE_WAIT_USEC); - if (r < 0) - goto error; - } else { - /* - * A zero timeout means we drop the probing alltogether, and behave as if - * the last probe succeeded immediately. - */ - acd->n_iteration = N_ACD_RFC_PROBE_NUM; - - r = n_acd_schedule(acd, 0, 0); - if (r < 0) - goto error; + node->is_public = true; + *eventp = &node->event; + return 0; } - acd->state = N_ACD_STATE_PROBING; - acd->defend = N_ACD_DEFEND_NEVER; - acd->last_defend = 0; - return 0; - -error: - n_acd_reset(acd); - return r; -} - -/** - * n_acd_stop() - stop the ACD engine - * @acd: ACD context - * - * Stop the engine. No new events may be triggered, but pending events are not - * flushed. Before calling n_acd_start() again all pending events must be popped. 
- * - * Return: 0 on success, negative error code on failure. - */ -_public_ int n_acd_stop(NAcd *acd) { - n_acd_reset(acd); + *eventp = NULL; return 0; } /** - * n_acd_announce() - announce the configured IP address - * @acd: ACD context - * @defend: defence policy - * - * Announce the IP address on the local link, and start defending it according - * to the given policy, which mut be one of N_ACD_DEFEND_ONCE, - * N_ACD_DEFEND_NEVER, or N_ACD_DEFEND_ALWAYS. - * - * This must be called after the engine in response to an N_ACD_EVENT_READY - * event, and only after the given address has been configured on the given - * interface. - * - * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the defence policy - * is invalid, N_ACD_E_BUSY if this is not in response to a - * N_ACD_EVENT_READY event, or a negative error code on failure. + * XXX */ -_public_ int n_acd_announce(NAcd *acd, unsigned int defend) { - uint64_t now; - int r; - - if (defend >= _N_ACD_DEFEND_N) - return N_ACD_E_INVALID_ARGUMENT; - if (acd->state != N_ACD_STATE_CONFIGURING) - return N_ACD_E_BUSY; - - /* - * Sending announcements means we finished probing and use the address - * now. We therefore reset the conflict counter in case we adhered to - * the rate-limit. Since probing is properly delayed, a well-behaving - * client will always reset the conflict counter here. However, if you - * force-use an address regardless of conflicts, then this will not - * trigger and the conflict counter stays untouched. - */ - if (acd->last_conflict != TIME_INFINITY) { - r = n_acd_now(&now); - if (r < 0) - return r; - - if (now >= acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC) - acd->n_conflicts = 0; - } - - /* - * Instead of sending the first announcement here, we schedule an idle - * timer. This avoids possibly recursing into the user callback. We - * should never trigger callbacks from arbitrary stacks, but always - * restrict them to the dispatcher. 
- */ - r = n_acd_schedule(acd, 0, 0); - if (r < 0) - return r; - - acd->state = N_ACD_STATE_ANNOUNCING; - acd->defend = defend; - acd->n_iteration = 0; - return 0; +_public_ int n_acd_probe(NAcd *acd, NAcdProbe **probep, NAcdProbeConfig *config) { + return n_acd_probe_new(probep, acd, config); } diff --git a/src/n-acd.h b/src/n-acd.h index eb12a53eec..74b0aacb59 100644 --- a/src/n-acd.h +++ b/src/n-acd.h @@ -15,42 +15,23 @@ extern "C" { #include #include +typedef struct NAcd NAcd; +typedef struct NAcdConfig NAcdConfig; +typedef struct NAcdEvent NAcdEvent; +typedef struct NAcdProbe NAcdProbe; +typedef struct NAcdProbeConfig NAcdProbeConfig; + +#define N_ACD_TIMEOUT_RFC5227 (UINT64_C(9000)) + enum { _N_ACD_E_SUCCESS, - N_ACD_E_DONE, - N_ACD_E_STOPPED, N_ACD_E_PREEMPTED, - N_ACD_E_INVALID_ARGUMENT, - N_ACD_E_BUSY, + + _N_ACD_E_N, }; -typedef struct NAcd NAcd; - -typedef struct NAcdConfig { - int ifindex; - unsigned int transport; - const uint8_t *mac; - size_t n_mac; - struct in_addr ip; - uint64_t timeout_msec; -} NAcdConfig; - -typedef struct NAcdEvent { - unsigned int event; - union { - struct { - } ready, down; - struct { - uint16_t operation; - uint8_t *sender; - size_t n_sender; - struct in_addr target; - } used, defended, conflict; - }; -} NAcdEvent; - enum { N_ACD_TRANSPORT_ETHERNET, _N_ACD_TRANSPORT_N, @@ -72,21 +53,94 @@ enum { _N_ACD_DEFEND_N, }; -int n_acd_new(NAcd **acdp); -NAcd *n_acd_free(NAcd *acd); +struct NAcdEvent { + unsigned int event; + union { + struct { + NAcdProbe *probe; + } ready; + struct { + } down; + struct { + NAcdProbe *probe; + uint8_t *sender; + size_t n_sender; + } used, defended, conflict; + }; +}; + +/* configs */ + +int n_acd_config_new(NAcdConfig **configp); +NAcdConfig *n_acd_config_free(NAcdConfig *config); + +void n_acd_config_set_ifindex(NAcdConfig *config, int ifindex); +void n_acd_config_set_transport(NAcdConfig *config, unsigned int transport); +void n_acd_config_set_mac(NAcdConfig *config, const uint8_t *mac, size_t 
n_mac); + +int n_acd_probe_config_new(NAcdProbeConfig **configp); +NAcdProbeConfig *n_acd_probe_config_free(NAcdProbeConfig *config); + +void n_acd_probe_config_set_ip(NAcdProbeConfig *config, struct in_addr ip); +void n_acd_probe_config_set_timeout(NAcdProbeConfig *config, uint64_t msecs); + +/* contexts */ + +int n_acd_new(NAcd **acdp, NAcdConfig *config); +NAcd *n_acd_ref(NAcd *acd); +NAcd *n_acd_unref(NAcd *acd); void n_acd_get_fd(NAcd *acd, int *fdp); - int n_acd_dispatch(NAcd *acd); int n_acd_pop_event(NAcd *acd, NAcdEvent **eventp); -int n_acd_announce(NAcd *acd, unsigned int defend); -int n_acd_start(NAcd *acd, NAcdConfig *config); -int n_acd_stop(NAcd *acd); +int n_acd_probe(NAcd *acd, NAcdProbe **probep, NAcdProbeConfig *config); -static inline void n_acd_freep(NAcd **acd) { +/* probes */ + +NAcdProbe *n_acd_probe_free(NAcdProbe *probe); + +void n_acd_probe_set_userdata(NAcdProbe *probe, void *userdata); +void n_acd_probe_get_userdata(NAcdProbe *probe, void **userdatap); + +int n_acd_probe_announce(NAcdProbe *probe, unsigned int defend); + +/* inline helpers */ + +static inline void n_acd_config_freep(NAcdConfig **config) { + if (*config) + n_acd_config_free(*config); +} + +static inline void n_acd_config_freev(NAcdConfig *config) { + n_acd_config_free(config); +} + +static inline void n_acd_probe_config_freep(NAcdProbeConfig **config) { + if (*config) + n_acd_probe_config_free(*config); +} + +static inline void n_acd_probe_config_freev(NAcdProbeConfig *config) { + n_acd_probe_config_free(config); +} + +static inline void n_acd_unrefp(NAcd **acd) { if (*acd) - n_acd_free(*acd); + n_acd_unref(*acd); +} + +static inline void n_acd_unrefv(NAcd *acd) { + n_acd_unref(acd); +} + +static inline void n_acd_probe_freep(NAcdProbe **probe) { + if (*probe) + n_acd_probe_free(*probe); +} + +static inline void n_acd_probe_freev(NAcdProbe *probe) { + n_acd_probe_free(probe); } #ifdef __cplusplus diff --git a/src/test-api.c b/src/test-api.c index 697181abaa..e16de48b73 
100644 --- a/src/test-api.c +++ b/src/test-api.c @@ -7,67 +7,90 @@ #include #include "test.h" -static void test_api_constants(void) { - assert(N_ACD_DEFEND_NEVER != _N_ACD_DEFEND_N); - assert(N_ACD_DEFEND_ONCE != _N_ACD_DEFEND_N); - assert(N_ACD_DEFEND_ALWAYS != _N_ACD_DEFEND_N); +static void test_api(void) { + NAcdConfig *config = NULL; + NAcd *acd = NULL; + int r; + + assert(N_ACD_E_PREEMPTED); + assert(N_ACD_E_INVALID_ARGUMENT); + + assert(N_ACD_TRANSPORT_ETHERNET != _N_ACD_TRANSPORT_N); assert(N_ACD_EVENT_READY != _N_ACD_EVENT_N); assert(N_ACD_EVENT_USED != _N_ACD_EVENT_N); assert(N_ACD_EVENT_DEFENDED != _N_ACD_EVENT_N); assert(N_ACD_EVENT_CONFLICT != _N_ACD_EVENT_N); assert(N_ACD_EVENT_DOWN != _N_ACD_EVENT_N); -} -static void test_api_management(void) { - NAcd *acd = NULL; - int r; + assert(N_ACD_DEFEND_NEVER != _N_ACD_DEFEND_N); + assert(N_ACD_DEFEND_ONCE != _N_ACD_DEFEND_N); + assert(N_ACD_DEFEND_ALWAYS != _N_ACD_DEFEND_N); - /* new/free/freep */ + n_acd_config_freep(&config); - n_acd_freep(&acd); - - r = n_acd_new(&acd); + r = n_acd_config_new(&config); assert(!r); - n_acd_free(acd); -} + n_acd_config_set_ifindex(config, 1); + n_acd_config_set_transport(config, N_ACD_TRANSPORT_ETHERNET); + n_acd_config_set_mac(config, (uint8_t[6]){ }, 6); -static void test_api_runtime(void) { - NAcdConfig config = { - .ifindex = 1, - .transport = N_ACD_TRANSPORT_ETHERNET, - .mac = (uint8_t[]){ 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54 }, - .n_mac = ETH_ALEN, - .ip = { htobe32((127 << 24) | (1 << 0)) }, - .timeout_msec = 100, - }; - NAcd *acd; - int r; + { + NAcdEvent *event; + int fd; - /* get_fd/dispatch/pop_event/start/stop/announce */ + n_acd_unrefp(&acd); + n_acd_ref(NULL); - r = n_acd_new(&acd); - assert(!r); + r = n_acd_new(&acd, config); + assert(!r); - n_acd_get_fd(acd, &r); - assert(r >= 0); - r = n_acd_dispatch(acd); - assert(!r); - r = n_acd_pop_event(acd, NULL); - assert(r == N_ACD_E_STOPPED); - r = n_acd_start(acd, &config); - assert(!r); - r = n_acd_start(acd, 
&config); - assert(r == N_ACD_E_BUSY); - r = n_acd_pop_event(acd, NULL); - assert(r == N_ACD_E_DONE); - r = n_acd_stop(acd); - assert(!r); - r = n_acd_announce(acd, N_ACD_DEFEND_NEVER); - assert(r == N_ACD_E_BUSY); + n_acd_get_fd(acd, &fd); + n_acd_dispatch(acd); + n_acd_pop_event(acd, &event); - n_acd_free(acd); + { + NAcdProbeConfig *c = NULL; + + n_acd_probe_config_freep(&c); + + r = n_acd_probe_config_new(&c); + assert(!r); + + n_acd_probe_config_set_ip(c, (struct in_addr){ 1 }); + n_acd_probe_config_set_timeout(c, N_ACD_TIMEOUT_RFC5227); + + { + NAcdProbe *probe = NULL; + void *userdata; + + r = n_acd_probe(acd, &probe, c); + assert(!r); + + n_acd_probe_get_userdata(probe, &userdata); + assert(userdata == NULL); + n_acd_probe_set_userdata(probe, acd); + n_acd_probe_get_userdata(probe, &userdata); + assert(userdata == acd); + + r = n_acd_probe_announce(probe, N_ACD_DEFEND_ONCE); + assert(!r); + + n_acd_probe_free(probe); + n_acd_probe_freev(NULL); + } + + n_acd_probe_config_free(c); + n_acd_probe_config_freev(NULL); + } + + n_acd_unref(acd); + n_acd_unrefv(NULL); + } + + n_acd_config_free(config); + n_acd_config_freev(NULL); } int main(int argc, char **argv) { @@ -77,8 +100,6 @@ int main(int argc, char **argv) { if (r) return r; - test_api_constants(); - test_api_management(); - test_api_runtime(); + test_api(); return 0; } diff --git a/src/test-basic.c b/src/test-basic.c deleted file mode 100644 index fa85cb0549..0000000000 --- a/src/test-basic.c +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Basic Tests - */ - -#include -#include -#include -#include -#include "n-acd.h" - -int main(int argc, char **argv) { - return 0; -} diff --git a/src/test-bpf.c b/src/test-bpf.c new file mode 100644 index 0000000000..aa8b20ec30 --- /dev/null +++ b/src/test-bpf.c @@ -0,0 +1,228 @@ +/* + * eBPF socket filter tests + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "n-acd.h" +#include "n-acd-private.h" 
+#include "test.h" + +#define ETHER_ARP_PACKET_INIT(_op, _mac, _sip, _tip) { \ + .ea_hdr = { \ + .ar_hrd = htobe16(ARPHRD_ETHER), \ + .ar_pro = htobe16(ETHERTYPE_IP), \ + .ar_hln = 6, \ + .ar_pln = 4, \ + .ar_op = htobe16(_op), \ + }, \ + .arp_sha[0] = (_mac)->ether_addr_octet[0], \ + .arp_sha[1] = (_mac)->ether_addr_octet[1], \ + .arp_sha[2] = (_mac)->ether_addr_octet[2], \ + .arp_sha[3] = (_mac)->ether_addr_octet[3], \ + .arp_sha[4] = (_mac)->ether_addr_octet[4], \ + .arp_sha[5] = (_mac)->ether_addr_octet[5], \ + .arp_spa[0] = (be32toh((_sip)->s_addr) >> 24) & 0xff, \ + .arp_spa[1] = (be32toh((_sip)->s_addr) >> 16) & 0xff, \ + .arp_spa[2] = (be32toh((_sip)->s_addr) >> 8) & 0xff, \ + .arp_spa[3] = be32toh((_sip)->s_addr) & 0xff, \ + .arp_tpa[0] = (be32toh((_tip)->s_addr) >> 24) & 0xff, \ + .arp_tpa[1] = (be32toh((_tip)->s_addr) >> 16) & 0xff, \ + .arp_tpa[2] = (be32toh((_tip)->s_addr) >> 8) & 0xff, \ + .arp_tpa[3] = be32toh((_tip)->s_addr) & 0xff, \ + } + +static void test_map(void) { + int r, mapfd = -1; + struct in_addr addr = { 1 }; + + r = n_acd_bpf_map_create(&mapfd, 8); + assert(r >= 0); + assert(mapfd >= 0); + + r = n_acd_bpf_map_remove(mapfd, &addr); + assert(r == -ENOENT); + + r = n_acd_bpf_map_add(mapfd, &addr); + assert(r >= 0); + + r = n_acd_bpf_map_add(mapfd, &addr); + assert(r == -EEXIST); + + r = n_acd_bpf_map_remove(mapfd, &addr); + assert(r >= 0); + + r = n_acd_bpf_map_remove(mapfd, &addr); + assert(r == -ENOENT); + + close(mapfd); +} + +static void verify_success(struct ether_arp *packet, int out_fd, int in_fd) { + uint8_t buf[sizeof(struct ether_arp)]; + int r; + + r = send(out_fd, packet, sizeof(struct ether_arp), 0); + assert(r == sizeof(struct ether_arp)); + + r = recv(in_fd, buf, sizeof(buf), 0); + assert(r == sizeof(struct ether_arp)); +} + +static void verify_failure(struct ether_arp *packet, int out_fd, int in_fd) { + uint8_t buf[sizeof(struct ether_arp)]; + int r; + + r = send(out_fd, packet, sizeof(struct ether_arp), 0); + assert(r == 
sizeof(struct ether_arp)); + + r = recv(in_fd, buf, sizeof(buf), 0); + assert(r < 0); + assert(errno == EAGAIN); +} + +static void test_filter(void) { + uint8_t buf[sizeof(struct ether_arp) + 1]; + struct ether_addr mac1 = { { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06 } }; + struct ether_addr mac2 = { { 0x01, 0x02, 0x03, 0x04, 0x05, 0x07 } }; + struct in_addr ip0 = { 0 }; + struct in_addr ip1 = { 1 }; + struct in_addr ip2 = { 2 }; + struct ether_arp *packet = (struct ether_arp *)buf; + int r, mapfd = -1, progfd = -1, pair[2]; + + r = n_acd_bpf_map_create(&mapfd, 1); + assert(r >= 0); + + r = n_acd_bpf_compile(&progfd, mapfd, &mac1); + assert(r >= 0); + assert(progfd >= 0); + + r = socketpair(AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC | SOCK_NONBLOCK, 0, pair); + assert(r >= 0); + + r = setsockopt(pair[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, + sizeof(progfd)); + assert(r >= 0); + + r = n_acd_bpf_map_add(mapfd, &ip1); + assert(r >= 0); + + /* valid */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + verify_success(packet, pair[0], pair[1]); + + /* valid: reply instead of request */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REPLY, &mac2, &ip1, &ip2); + verify_success(packet, pair[0], pair[1]); + + /* valid: to us instead of from us */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip0, &ip1); + verify_success(packet, pair[0], pair[1]); + + /* invalid header type */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + packet->arp_hrd += 1; + verify_failure(packet, pair[0], pair[1]); + + /* invalid protocol */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + packet->arp_pro += 1; + verify_failure(packet, pair[0], pair[1]); + + /* invalid hw addr length */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + packet->arp_hln += 1; + verify_failure(packet, pair[0], pair[1]); + + /* invalid 
protocol addr length */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + packet->arp_pln += 1; + verify_failure(packet, pair[0], pair[1]); + + /* invalid operation */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_NAK, &mac2, &ip1, &ip2); + packet->arp_hln += 1; + verify_failure(packet, pair[0], pair[1]); + + /* own mac */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac1, &ip1, &ip2); + verify_failure(packet, pair[0], pair[1]); + + /* not to, nor from us, with source */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip2, &ip2); + verify_failure(packet, pair[0], pair[1]); + + /* not to, nor from us, without source */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip0, &ip2); + verify_failure(packet, pair[0], pair[1]); + + /* to us instead of from us, but reply */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REPLY, &mac2, &ip0, &ip1); + verify_failure(packet, pair[0], pair[1]); + + /* long */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + r = send(pair[0], buf, sizeof(struct ether_arp) + 1, 0); + assert(r == sizeof(struct ether_arp) + 1); + + r = recv(pair[1], buf, sizeof(buf), 0); + assert(r == sizeof(struct ether_arp)); + + /* short */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + r = send(pair[0], buf, sizeof(struct ether_arp) - 1, 0); + assert(r == sizeof(struct ether_arp) - 1); + + r = recv(pair[1], buf, sizeof(buf), 0); + assert(r < 0); + assert(errno == EAGAIN); + + /* + * Send one packet before and one packet after modifying the map, + * verify that the modification applies at the time of send(), not recv(). 
+ */ + *packet = (struct ether_arp)ETHER_ARP_PACKET_INIT(ARPOP_REQUEST, &mac2, &ip1, &ip2); + r = send(pair[0], buf, sizeof(struct ether_arp), 0); + assert(r == sizeof(struct ether_arp)); + + r = n_acd_bpf_map_remove(mapfd, &ip1); + assert(r >= 0); + + r = send(pair[0], buf, sizeof(struct ether_arp), 0); + assert(r == sizeof(struct ether_arp)); + + r = recv(pair[1], buf, sizeof(buf), 0); + assert(r == sizeof(struct ether_arp)); + + r = recv(pair[1], buf, sizeof(buf), 0); + assert(r < 0); + assert(errno == EAGAIN); + + close(pair[0]); + close(pair[1]); + close(progfd); + close(mapfd); +} + +int main(int argc, char **argv) { + int r; + + r = test_setup(); + if (r) + return r; + + test_map(); + test_filter(); + + return 0; +} diff --git a/src/test-loopback.c b/src/test-loopback.c index 98195c93a5..5c01d65b68 100644 --- a/src/test-loopback.c +++ b/src/test-loopback.c @@ -9,44 +9,62 @@ #include "test.h" static void test_loopback(int ifindex, uint8_t *mac, size_t n_mac) { - NAcdConfig config = { - .ifindex = ifindex, - .transport = N_ACD_TRANSPORT_ETHERNET, - .mac = mac, - .n_mac = n_mac, - .ip = { htobe32((192 << 24) | (168 << 16) | (1 << 0)) }, - .timeout_msec = 100, - }; - struct pollfd pfds; + NAcdConfig *config; NAcd *acd; + struct pollfd pfds; int r, fd; - r = n_acd_new(&acd); + r = n_acd_config_new(&config); assert(!r); - n_acd_get_fd(acd, &fd); - r = n_acd_start(acd, &config); + n_acd_config_set_ifindex(config, ifindex); + n_acd_config_set_transport(config, N_ACD_TRANSPORT_ETHERNET); + n_acd_config_set_mac(config, mac, n_mac); + + r = n_acd_new(&acd, config); assert(!r); - for (;;) { - NAcdEvent *event; - pfds = (struct pollfd){ .fd = fd, .events = POLLIN }; - r = poll(&pfds, 1, -1); - assert(r >= 0); + n_acd_config_free(config); - r = n_acd_dispatch(acd); + { + NAcdProbeConfig *probe_config; + NAcdProbe *probe; + struct in_addr ip = { htobe32((192 << 24) | (168 << 16) | (1 << 0)) }; + + r = n_acd_probe_config_new(&probe_config); assert(!r); - r = 
n_acd_pop_event(acd, &event); - if (!r) { - assert(event->event == N_ACD_EVENT_READY); - break; - } else { - assert(r == N_ACD_E_DONE); + n_acd_probe_config_set_ip(probe_config, ip); + n_acd_probe_config_set_timeout(probe_config, 100); + + r = n_acd_probe(acd, &probe, probe_config); + assert(!r); + + n_acd_probe_config_free(probe_config); + + n_acd_get_fd(acd, &fd); + + for (;;) { + NAcdEvent *event; + pfds = (struct pollfd){ .fd = fd, .events = POLLIN }; + r = poll(&pfds, 1, -1); + assert(r >= 0); + + r = n_acd_dispatch(acd); + assert(!r); + + r = n_acd_pop_event(acd, &event); + assert(!r); + if (event) { + assert(event->event == N_ACD_EVENT_READY); + break; + } } + + n_acd_probe_free(probe); } - n_acd_free(acd); + n_acd_unref(acd); } int main(int argc, char **argv) { @@ -57,9 +75,7 @@ int main(int argc, char **argv) { if (r) return r; - r = system("ip link set lo up"); - assert(r == 0); - test_if_query("lo", &ifindex, &mac); + test_loopback_up(&ifindex, &mac); test_loopback(ifindex, mac.ether_addr_octet, sizeof(mac.ether_addr_octet)); return 0; diff --git a/src/test-veth.c b/src/test-veth.c new file mode 100644 index 0000000000..64724f5e9f --- /dev/null +++ b/src/test-veth.c @@ -0,0 +1,238 @@ +/* + * Test on a veth link + * + * This essentially mimics a real network with two peers. + * + * Run one ACD context on each end of the tunnel. On one end probe for N + * addresses, on the other end pre-configure N/3 of the same addresses and probe + * for another N/3 of the addresses. + * + * Verify that in the case of simultaneous probes of the same address at most one + * succeeds, in the case of probing for a configured address it always fails, and + * probing for a non-existent address always succeeds. + * + * Make sure to keep N fairly high as the protocol is probabilistic, and we also + * want to verify that resizing the internal maps works correctly. 
+ */ + +#include +#include "test.h" + +#define TEST_ACD_N_PROBES (9) + +typedef enum { + TEST_ACD_STATE_UNKNOWN, + TEST_ACD_STATE_USED, + TEST_ACD_STATE_READY, +} TestAcdState; + +static void test_veth(int ifindex1, uint8_t *mac1, size_t n_mac1, + int ifindex2, uint8_t *mac2, size_t n_mac2) { + NAcdConfig *config; + NAcd *acd1, *acd2; + NAcdProbe *probes1[TEST_ACD_N_PROBES]; + NAcdProbe *probes2[TEST_ACD_N_PROBES]; + unsigned long state1, state2; + size_t n_running = 0; + int r; + + r = n_acd_config_new(&config); + assert(!r); + + n_acd_config_set_transport(config, N_ACD_TRANSPORT_ETHERNET); + + n_acd_config_set_ifindex(config, ifindex1); + n_acd_config_set_mac(config, mac1, n_mac1); + r = n_acd_new(&acd1, config); + assert(!r); + + n_acd_config_set_ifindex(config, ifindex2); + n_acd_config_set_mac(config, mac2, n_mac2); + r = n_acd_new(&acd2, config); + assert(!r); + + n_acd_config_free(config); + + { + NAcdProbeConfig *probe_config; + + r = n_acd_probe_config_new(&probe_config); + assert(!r); + n_acd_probe_config_set_timeout(probe_config, 64); + + assert(TEST_ACD_N_PROBES <= 10 << 24); + + for (size_t i = 0; i < TEST_ACD_N_PROBES; ++i) { + struct in_addr ip = { htobe32((10 << 24) | i) }; + + n_acd_probe_config_set_ip(probe_config, ip); + + switch (i % 3) { + case 0: + /* + * Probe on one side, and leave the address + * unset on the other. The probe must succeed. + */ + + break; + case 1: + /* + * Preconfigure the address on one side, and + * probe on the other. The probe must fail. + */ + test_add_child_ip(&ip); + + break; + case 2: + /* + * Probe both sides for the same address, at + * most one may succeed. 
+ */ + r = n_acd_probe(acd2, &probes2[i], probe_config); + assert(!r); + + ++n_running; + + break; + } + + r = n_acd_probe(acd1, &probes1[i], probe_config); + assert(!r); + + ++n_running; + } + + n_acd_probe_config_free(probe_config); + + while (n_running > 0) { + NAcdEvent *event; + struct pollfd pfds[2] = { + { .events = POLLIN }, + { .events = POLLIN }, + }; + + n_acd_get_fd(acd1, &pfds[0].fd); + n_acd_get_fd(acd2, &pfds[1].fd); + + r = poll(pfds, 2, -1); + assert(r >= 0); + + if (pfds[0].revents & POLLIN) { + r = n_acd_dispatch(acd1); + assert(!r || r == N_ACD_E_PREEMPTED); + + for (;;) { + r = n_acd_pop_event(acd1, &event); + assert(!r); + if (event) { + switch (event->event) { + case N_ACD_EVENT_READY: + n_acd_probe_get_userdata(event->ready.probe, (void**)&state1); + assert(state1 == TEST_ACD_STATE_UNKNOWN); + state1 = TEST_ACD_STATE_READY; + n_acd_probe_set_userdata(event->ready.probe, (void*)state1); + + break; + case N_ACD_EVENT_USED: + n_acd_probe_get_userdata(event->used.probe, (void**)&state1); + assert(state1 == TEST_ACD_STATE_UNKNOWN); + state1 = TEST_ACD_STATE_USED; + n_acd_probe_set_userdata(event->used.probe, (void*)state1); + + break; + default: + assert(0); + } + + --n_running; + } else { + break; + } + } + } + + if (pfds[1].revents & POLLIN) { + r = n_acd_dispatch(acd2); + assert(!r || r == N_ACD_E_PREEMPTED); + + for (;;) { + r = n_acd_pop_event(acd2, &event); + assert(!r); + if (event) { + switch (event->event) { + case N_ACD_EVENT_READY: + n_acd_probe_get_userdata(event->ready.probe, (void**)&state2); + assert(state2 == TEST_ACD_STATE_UNKNOWN); + state2 = TEST_ACD_STATE_READY; + n_acd_probe_set_userdata(event->ready.probe, (void*)state2); + + break; + case N_ACD_EVENT_USED: + n_acd_probe_get_userdata(event->used.probe, (void**)&state2); + assert(state2 == TEST_ACD_STATE_UNKNOWN); + state2 = TEST_ACD_STATE_USED; + n_acd_probe_set_userdata(event->used.probe, (void*)state2); + + break; + default: + assert(0); + } + + --n_running; + } else { + 
break; + } + } + } + } + + for (size_t i = 0; i < TEST_ACD_N_PROBES; ++i) { + struct in_addr ip = { htobe32((10 << 24) | i) }; + + switch (i % 3) { + case 0: + n_acd_probe_get_userdata(probes1[i], (void **)&state1); + assert(state1 == TEST_ACD_STATE_READY); + + break; + case 1: + test_del_child_ip(&ip); + + n_acd_probe_get_userdata(probes1[i], (void **)&state1); + assert(state1 == TEST_ACD_STATE_USED); + + break; + case 2: + n_acd_probe_get_userdata(probes1[i], (void **)&state1); + n_acd_probe_get_userdata(probes2[i], (void **)&state2); + assert(state1 != TEST_ACD_STATE_UNKNOWN); + assert(state2 != TEST_ACD_STATE_UNKNOWN); + assert(state1 == TEST_ACD_STATE_USED || state2 == TEST_ACD_STATE_USED); + n_acd_probe_free(probes2[i]); + + break; + } + n_acd_probe_free(probes1[i]); + } + } + + n_acd_unref(acd2); + n_acd_unref(acd1); +} + +int main(int argc, char **argv) { + struct ether_addr mac1, mac2; + int r, ifindex1, ifindex2; + + r = test_setup(); + if (r) + return r; + + test_veth_new(&ifindex1, &mac1, &ifindex2, &mac2); + for (unsigned int i = 0; i < 8; ++i) { + test_veth(ifindex1, mac1.ether_addr_octet, sizeof(mac1.ether_addr_octet), + ifindex2, mac2.ether_addr_octet, sizeof(mac2.ether_addr_octet)); + } + + return 0; +} diff --git a/src/test.h b/src/test.h index 92315858ba..f2cb801aab 100644 --- a/src/test.h +++ b/src/test.h @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -22,6 +24,32 @@ #include #include "n-acd.h" +static inline void test_add_child_ip(const struct in_addr *addr) { + char *p; + int r; + + r = asprintf(&p, "ip addr add dev veth1 %s/8", inet_ntoa(*addr)); + assert(r >= 0); + + r = system(p); + assert(r >= 0); + + free(p); +} + +static inline void test_del_child_ip(const struct in_addr *addr) { + char *p; + int r; + + r = asprintf(&p, "ip addr del dev veth1 %s/8", inet_ntoa(*addr)); + assert(r >= 0); + + r = system(p); + assert(r >= 0); + + free(p); +} + static inline void test_if_query(const 
char *name, int *indexp, struct ether_addr *macp) { struct ifreq ifr = {}; size_t l; @@ -39,7 +67,7 @@ static inline void test_if_query(const char *name, int *indexp, struct ether_add s = socket(AF_INET, SOCK_DGRAM, 0); assert(s >= 0); - strncpy(ifr.ifr_name, name, l); + strncpy(ifr.ifr_name, name, l + 1); r = ioctl(s, SIOCGIFHWADDR, &ifr); assert(r >= 0); @@ -84,6 +112,15 @@ static inline void test_veth_new(int *parent_indexp, test_if_query("veth1", child_indexp, child_macp); } +static inline void test_loopback_up(int *indexp, struct ether_addr *macp) { + int r; + + r = system("ip link set lo up"); + assert(r == 0); + + test_if_query("lo", indexp, macp); +} + static inline int test_setup(void) { int r; diff --git a/src/util/test-timer.c b/src/util/test-timer.c new file mode 100644 index 0000000000..9cc3109b60 --- /dev/null +++ b/src/util/test-timer.c @@ -0,0 +1,176 @@ +/* + * Tests for timer utility library + */ + +#include +#include + +#include +#include +#include +#include +#include "timer.h" + +#define N_TIMEOUTS (10000) + +static void test_api(void) { + Timer timer = TIMER_NULL(timer); + Timeout t1 = TIMEOUT_INIT(t1), t2 = TIMEOUT_INIT(t2), *t; + int r; + + r = timer_init(&timer); + assert(!r); + + timeout_schedule(&t1, &timer, 1); + timeout_schedule(&t2, &timer, 2); + + r = timer_pop_timeout(&timer, 10, &t); + assert(!r); + assert(t == &t1); + + timeout_unschedule(&t2); + + r = timer_pop_timeout(&timer, 10, &t); + assert(!r); + assert(!t); + + timer_deinit(&timer); +} + +static void test_pop(void) { + Timer timer = TIMER_NULL(timer); + Timeout timeouts[N_TIMEOUTS] = {}; + uint64_t times[N_TIMEOUTS] = {}; + size_t n_timeouts = 0; + bool armed; + Timeout *t; + int r; + + r = timer_init(&timer); + assert(!r); + + for(size_t i = 0; i < N_TIMEOUTS; ++i) { + timeouts[i] = (Timeout)TIMEOUT_INIT(timeouts[i]); + times[i] = rand() % 128 + 1; + timeout_schedule(&timeouts[i], &timer, times[i]); + } + + armed = true; + + for(size_t i = 0; i <= 128; ++i) { + if (armed) { + 
struct pollfd pfd = { + .fd = timer.fd, + .events = POLLIN, + }; + uint64_t count; + + r = poll(&pfd, 1, -1); + assert(r == 1); + + r = read(timer.fd, &count, sizeof(count)); + assert(r == sizeof(count)); + assert(count == 1); + armed = false; + } + + for (;;) { + uint64_t current_time; + + r = timer_pop_timeout(&timer, i, &t); + assert(!r); + if (!t) { + timer_rearm(&timer); + break; + } + + current_time = times[t - timeouts]; + assert(current_time == i); + ++n_timeouts; + armed = true; + } + } + + assert(n_timeouts == N_TIMEOUTS); + + r = timer_pop_timeout(&timer, (uint64_t)-1, &t); + assert(!r); + assert(!t); + + timer_deinit(&timer); +} + +void test_arm(void) { + struct itimerspec spec = { + .it_value = { + .tv_sec = 1000, + }, + }; + int fd1, fd2, r; + + fd1 = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); + assert(fd1 >= 0); + + fd2 = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); + assert(fd2 >= 0); + + r = timerfd_settime(fd1, 0, &spec, NULL); + assert(r >= 0); + + r = timerfd_settime(fd2, 0, &spec, NULL); + assert(r >= 0); + + r = timerfd_gettime(fd1, &spec); + assert(r >= 0); + assert(spec.it_value.tv_sec); + + r = timerfd_gettime(fd2, &spec); + assert(r >= 0); + assert(spec.it_value.tv_sec); + + spec = (struct itimerspec){}; + + r = timerfd_settime(fd1, 0, &spec, NULL); + assert(r >= 0); + + r = timerfd_gettime(fd1, &spec); + assert(r >= 0); + assert(!spec.it_value.tv_sec); + assert(!spec.it_value.tv_nsec); + + r = timerfd_gettime(fd2, &spec); + assert(r >= 0); + assert(spec.it_value.tv_sec); + + spec = (struct itimerspec){ .it_value = { .tv_nsec = 1, }, }; + + r = timerfd_settime(fd1, 0, &spec, NULL); + assert(r >= 0); + + r = poll(&(struct pollfd) { .fd = fd1, .events = POLLIN }, 1, -1); + assert(r == 1); + + r = timerfd_settime(fd2, 0, &spec, NULL); + assert(r >= 0); + + r = poll(&(struct pollfd) { .fd = fd2, .events = POLLIN }, 1, -1); + assert(r == 1); + + spec = (struct itimerspec){}; + + r = timerfd_settime(fd1, 0,
&spec, NULL); + assert(r >= 0); + + r = poll(&(struct pollfd) { .fd = fd2, .events = POLLIN }, 1, -1); + assert(r == 1); + + close(fd2); + close(fd1); +} + +int main(int argc, char **argv) { + test_arm(); + test_api(); + test_pop(); + return 0; +} diff --git a/src/util/timer.c b/src/util/timer.c new file mode 100644 index 0000000000..c995ba400f --- /dev/null +++ b/src/util/timer.c @@ -0,0 +1,189 @@ +/* + * Timer Utility Library + */ + +#include +#include +#include +#include +#include +#include +#include "timer.h" + +int timer_init(Timer *timer) { + clockid_t clock = CLOCK_BOOTTIME; + int r; + + r = timerfd_create(clock, TFD_CLOEXEC | TFD_NONBLOCK); + if (r < 0 && errno == EINVAL) { + clock = CLOCK_MONOTONIC; + r = timerfd_create(clock, TFD_CLOEXEC | TFD_NONBLOCK); + } + if (r < 0) + return -errno; + + *timer = (Timer)TIMER_NULL(*timer); + timer->fd = r; + timer->clock = clock; + + return 0; +} + +void timer_deinit(Timer *timer) { + assert(c_rbtree_is_empty(&timer->tree)); + + if (timer->fd >= 0) { + close(timer->fd); + timer->fd = -1; + } +} + +void timer_now(Timer *timer, uint64_t *nowp) { + struct timespec ts; + int r; + + r = clock_gettime(timer->clock, &ts); + assert(r >= 0); + + *nowp = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +} + +void timer_rearm(Timer *timer) { + uint64_t time; + Timeout *timeout; + int r; + + /* + * A timeout value of 0 clears the timer, we should only set that if + * no timeout exists in the tree. + */ + + timeout = c_rbnode_entry(c_rbtree_first(&timer->tree), Timeout, node); + assert(!timeout || timeout->timeout); + + time = timeout ?
timeout->timeout : 0; + + if (time != timer->scheduled_timeout) { + r = timerfd_settime(timer->fd, + TFD_TIMER_ABSTIME, + &(struct itimerspec){ + .it_value = { + .tv_sec = time / UINT64_C(1000000000), + .tv_nsec = time % UINT64_C(1000000000), + }, + }, + NULL); + assert(r >= 0); + + timer->scheduled_timeout = time; + } +} + +int timer_read(Timer *timer) { + uint64_t v; + int r; + + r = read(timer->fd, &v, sizeof(v)); + if (r < 0) { + if (errno == EAGAIN) { + /* + * No more pending events. + */ + return 0; + } else { + /* + * Something failed. We use CLOCK_BOOTTIME/MONOTONIC, + * so ECANCELED cannot happen. Hence, there is no + * error that we could gracefully handle. Fail hard + * and let the caller deal with it. + */ + return -errno; + } + } else if (r != sizeof(v) || v == 0) { + /* + * Kernel guarantees 8-byte reads, and only to return + * data if at least one timer triggered; fail hard if + * it suddenly starts doing weird shit. + */ + return -EIO; + } + + return TIMER_E_TRIGGERED; +} + + +int timer_pop_timeout(Timer *timer, uint64_t until, Timeout **timeoutp) { + Timeout *timeout; + + /* + * If the first timeout is scheduled before @until, then unlink + * it and return it. Otherwise, return NULL. + */ + timeout = c_rbnode_entry(c_rbtree_first(&timer->tree), Timeout, node); + if (timeout && timeout->timeout <= until) { + c_rbnode_unlink(&timeout->node); + timeout->timeout = 0; + *timeoutp = timeout; + } else { + *timeoutp = NULL; + } + + return 0; +} + +void timeout_schedule(Timeout *timeout, Timer *timer, uint64_t time) { + + assert(time); + + /* + * In case @timeout was already scheduled, remove it from the + * tree. If we are moving it to a new timer, rearm the old one. + */ + if (timeout->timer) { + c_rbnode_unlink(&timeout->node); + if (timeout->timer != timer) + timer_rearm(timeout->timer); + } + timeout->timer = timer; + timeout->timeout = time; + + /* + * Now insert it back into the tree in the correct new position. 
+ * We allow duplicates in the tree, so this insertion is open-coded. + */ + { + Timeout *other; + CRBNode **slot, *parent; + + slot = &timer->tree.root; + parent = NULL; + while (*slot) { + other = c_rbnode_entry(*slot, Timeout, node); + parent = *slot; + if (timeout->timeout < other->timeout) + slot = &(*slot)->left; + else + slot = &(*slot)->right; + } + + c_rbtree_add(&timer->tree, parent, slot, &timeout->node); + } + + /* + * Rearm the timer as we updated the timeout tree. + */ + timer_rearm(timer); +} + +void timeout_unschedule(Timeout *timeout) { + Timer *timer = timeout->timer; + + if (!timer) + return; + + c_rbnode_unlink(&timeout->node); + timeout->timeout = 0; + timeout->timer = NULL; + + timer_rearm(timer); +} diff --git a/src/util/timer.h b/src/util/timer.h new file mode 100644 index 0000000000..2acc99e379 --- /dev/null +++ b/src/util/timer.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include +#include + +typedef struct Timer Timer; +typedef struct Timeout Timeout; + +enum { + _TIMER_E_SUCCESS, + + TIMER_E_TRIGGERED, + + _TIMER_E_N, +}; + +struct Timer { + int fd; + clockid_t clock; + CRBTree tree; + uint64_t scheduled_timeout; +}; + +#define TIMER_NULL(_x) { \ + .fd = -1, \ + .tree = C_RBTREE_INIT, \ + } + +struct Timeout { + Timer *timer; + CRBNode node; + uint64_t timeout; +}; + +#define TIMEOUT_INIT(_x) { \ + .node = C_RBNODE_INIT((_x).node), \ + } + +int timer_init(Timer *timer); +void timer_deinit(Timer *timer); + +void timer_now(Timer *timer, uint64_t *nowp); + +int timer_pop_timeout(Timer *timer, uint64_t now, Timeout **timerp); +void timer_rearm(Timer *timer); +int timer_read(Timer *timer); + +void timeout_schedule(Timeout *timeout, Timer *timer, uint64_t time); +void timeout_unschedule(Timeout *timeout); + diff --git a/subprojects/c-list b/subprojects/c-list index 72c59181d6..dda36d30c7 160000 --- a/subprojects/c-list +++ b/subprojects/c-list @@ -1 +1 @@ -Subproject commit 72c59181d677a3f50b201d51f190b1bff02d4279 
+Subproject commit dda36d30c7d655b4d61358519168fa7ce0e9dae9 diff --git a/subprojects/c-rbtree b/subprojects/c-rbtree new file mode 160000 index 0000000000..bf627e0c32 --- /dev/null +++ b/subprojects/c-rbtree @@ -0,0 +1 @@ +Subproject commit bf627e0c32241915108f66ad9738444e4d045b45 diff --git a/subprojects/c-siphash b/subprojects/c-siphash index e01ab640dc..b24d2e2048 160000 --- a/subprojects/c-siphash +++ b/subprojects/c-siphash @@ -1 +1 @@ -Subproject commit e01ab640dcf72dfa6928c94a261bf78cd943d9c3 +Subproject commit b24d2e20489b08bb350d67b82f6fb354d6951a1c