bpf lpm trie

lpm有多种实现方式，最常用的是用trie。当然也会有更简单的实现方式，例如某些特定场景用多重哈希表就能解决（ipv4地址，32个掩码对应32个哈希表）

从4.11内核版本开始，bpf map引入了BPF_MAP_TYPE_LPM_TRIE

主要是用于匹配ip地址，内部是将数据存储在一个不平衡的trie中，key使用prefixlen,data

data是以大端网络序存储的，data[0]存的是msb。

prefixlen支持8的整数倍，最高可以是2048。因此除了ip匹配，还可以用来做端口，协议，vpcid等等的扩充匹配。在应用层面上除了做路由表，还可以作为acl，policy等匹配过滤的底层实现

使用方式

BPF_MAP_TYPE_LPM_TRIE — The Linux Kernel documentation

除了上述基本的Ipv4的使用方式，扩展使用方式可以参考一下cillium中IPCACHE_MAP的使用

首先是map的定义

struct ipcache_key {
	struct bpf_lpm_trie_key lpm_key;
	__u16 cluster_id;
	__u8 pad1;
	__u8 family;
	union {
		struct {
			__u32		ip4;
			__u32		pad4;
			__u32		pad5;
			__u32		pad6;
		};
		union v6addr	ip6;
	};
} __packed;

/* Global IP -> Identity map for applying egress label-based policy */
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__type(key, struct ipcache_key);
	__type(value, struct remote_endpoint_info);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
	__uint(max_entries, IPCACHE_MAP_SIZE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} IPCACHE_MAP __section_maps_btf;

可以看到cillium将v4和v6合并成一个map查询，匹配条件并带上了cluster_id

因此查询map时候，prefix_len需要加上cluster_id，pad1，family这四个字节的长度。例如查询192.168.10.0/24的时候，prefix_len(bit)= 24bit + 4byte*8bit/byte = 56bit

/* IPCACHE_STATIC_PREFIX gets sizeof non-IP, non-prefix part of ipcache_key */
#define IPCACHE_STATIC_PREFIX							\
	(8 * (sizeof(struct ipcache_key) - sizeof(struct bpf_lpm_trie_key)	\
	      - sizeof(union v6addr)))
#define IPCACHE_PREFIX_LEN(PREFIX) (IPCACHE_STATIC_PREFIX + (PREFIX))

static __always_inline __maybe_unused struct remote_endpoint_info *
ipcache_lookup4(const void *map, __be32 addr, __u32 prefix, __u32 cluster_id)
{
	struct ipcache_key key = {
		.lpm_key = { IPCACHE_PREFIX_LEN(prefix), {} },
		.family = ENDPOINT_KEY_IPV4,
		.ip4 = addr,
	};

	/* Check overflow */
	if (cluster_id > UINT16_MAX)
		return NULL;

	key.cluster_id = (__u16)cluster_id;

	key.ip4 &= GET_PREFIX(prefix);
	return map_lookup_elem(map, &key);
}

cluster_id的字节序是大端还是小端其实没有区别，只要插入和查询都用的相同的字节序就行

基本原理

lpm_trie.c - kernel/bpf/lpm_trie.c - Linux source code v5.13 - Bootlin Elixir Cross Referencer

数据结构

struct lpm_trie_node {
    struct rcu_head rcu;
    struct lpm_trie_node __rcu *child[2];
    u32 prefixlen;  // node的前缀，例如192.168.0.0/24这个node是24
    u32 flags;
    u8 data[];   // data+value，例如192.168.0.0/24 --》123，那么data[5]={c0, a8, 00, 00, 7b}
};
struct lpm_trie {
	struct bpf_map			map;
	struct lpm_trie_node __rcu	*root;
	size_t				n_entries; // 节点数
	size_t				max_prefixlen; //最长前缀bit，例如ipv4是32，上述IPCACHE_MAP的例子是32+32 = 64
	size_t				data_size; // max_prefixlen的byte表示，max_prefixlen/8
	spinlock_t			lock;
};

data 数组的前 trie->data_size 字节存储前缀数据。整个trie的所有节点，前缀数据的大小是固定的
如果节点有 value，则 value 紧跟在前缀数据之后存储。

分配节点

static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, const void *value)
{
    struct lpm_trie_node *node;
    size_t size = sizeof(struct lpm_trie_node) + trie->data_size;

    if (value)
        size += trie->map.value_size;  // 如果有value，分配的空间加上value大小

    node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN, trie->map.numa_node);
    if (!node)
        return NULL;

    node->flags = 0;

    if (value)
        memcpy(node->data + trie->data_size, value, trie->map.value_size);  // 如果有value，将其复制到 data 数组中前缀数据之后的位置。

    return node;
}

插入流程

/* This trie implements a longest prefix match algorithm that can be used to
 * match IP addresses to a stored set of ranges.
 *
 * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
 * interpreted as big endian, so data[0] stores the most significant byte.
 *
 * Match ranges are internally stored in instances of struct lpm_trie_node
 * which each contain their prefix length as well as two pointers that may
 * lead to more nodes containing more specific matches. Each node also stores
 * a value that is defined by and returned to userspace via the update_elem
 * and lookup functions.
 *
 * For instance, let's start with a trie that was created with a prefix length
 * of 32, so it can be used for IPv4 addresses, and one single element that
 * matches 192.168.0.0/16. The data array would hence contain
 * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
 * stick to IP-address notation for readability though.
 *
 * As the trie is empty initially, the new node (1) will be places as root
 * node, denoted as (R) in the example below. As there are no other node, both
 * child pointers are %NULL.
 *
 *              +----------------+
 *              |       (1)  (R) |
 *              | 192.168.0.0/16 |
 *              |    value: 1    |
 *              |   [0]    [1]   |
 *              +----------------+
 *
 * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
 * a node with the same data and a smaller prefix (ie, a less specific one),
 * node (2) will become a child of (1). In child index depends on the next bit
 * that is outside of what (1) matches, and that bit is 0, so (2) will be
 * child[0] of (1):
 
 * 由于已经存在一个data相同但长度更短的前缀，新前缀将作为节点(1)的子节点存储。新前缀（192.168.0.0/24）在旧前缀（192.168.0.0/16）之后的下一位（第17位） 为0，因此将成为孩子0
 *
 *              +----------------+
 *              |       (1)  (R) |
 *              | 192.168.0.0/16 |
 *              |    value: 1    |
 *              |   [0]    [1]   |
 *              +----------------+
 *                   |
 *    +----------------+
 *    |       (2)      |
 *    | 192.168.0.0/24 |
 *    |    value: 2    |
 *    |   [0]    [1]   |
 *    +----------------+
 *
 * The child[1] slot of (1) could be filled with another node which has bit #17
 * (the next bit after the ones that (1) matches on) set to 1. For instance,
 * 192.168.128.0/24:
 * 第17位为1，所以插入在右边
 *
 *              +----------------+
 *              |       (1)  (R) |
 *              | 192.168.0.0/16 |
 *              |    value: 1    |
 *              |   [0]    [1]   |
 *              +----------------+
 *                   |      |
 *    +----------------+  +------------------+
 *    |       (2)      |  |        (3)       |
 *    | 192.168.0.0/24 |  | 192.168.128.0/24 |
 *    |    value: 2    |  |     value: 3     |
 *    |   [0]    [1]   |  |    [0]    [1]    |
 *    +----------------+  +------------------+
 *
 * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
 * it, node (1) is looked at first, and because (4) of the semantics laid out
 * above (bit #17 is 0), it would normally be attached to (1) as child[0].
 * 本来要插入在（1）的左孩子，但是这个位置已经被（2）占掉了，所以要创造出这样的一个空间，也就是（2）的prefixlen 24往前移一位prefixlen 23，作为假节点
 * However, that slot is already allocated, so a new node is needed in between.
 * That node does not have a value attached to it and it will never be
 * returned to users as result of a lookup. It is only there to differentiate
 * the traversal further. It will get a prefix as wide as necessary to
 * distinguish its two children:
 *
 *                      +----------------+
 *                      |       (1)  (R) |
 *                      | 192.168.0.0/16 |
 *                      |    value: 1    |
 *                      |   [0]    [1]   |
 *                      +----------------+
 *                           |      |
 *            +----------------+  +------------------+
 *            |       (4)  (I) |  |        (3)       |
 *            | 192.168.0.0/23 |  | 192.168.128.0/24 |
 *            |    value: ---  |  |     value: 3     |
 *            |   [0]    [1]   |  |    [0]    [1]    |
 *            +----------------+  +------------------+
 *                 |      |
 *  +----------------+  +----------------+
 *  |       (2)      |  |       (5)      |
 *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
 *  |    value: 2    |  |     value: 5   |
 *  |   [0]    [1]   |  |   [0]    [1]   |
 *  +----------------+  +----------------+
 *
 * 192.168.1.1/32 would be a child of (5) etc.
 *
 * An intermediate node will be turned into a 'real' node on demand. In the
 * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
 *
 * A fully populated trie would have a height of 32 nodes, as the trie was
 * created with a prefix length of 32.
 *
 * The lookup starts at the root node. If the current node matches and if there
 * is a child that can be used to become more specific, the trie is traversed
 * downwards. The last node in the traversal that is a non-intermediate one is
 * returned.
 */


static size_t longest_prefix_match(const struct lpm_trie *trie,
				   const struct lpm_trie_node *node,
				   const struct bpf_lpm_trie_key *key)
{
	u32 limit = min(node->prefixlen, key->prefixlen);  // 匹配到两者的最小就可以停止了，例如/24和/16，只要匹配到/16就可以确定已经匹配上了
	u32 prefixlen = 0, i = 0;

	BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); // 确保数据四字节对齐
	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)  // 如果64位系统支持不对齐读取，则先处理开头的64bit

	/* data_size >= 16 has very small probability.
	 * We do not use a loop for optimal code generation.
	 */
	if (trie->data_size >= 8) {
		u64 diff = be64_to_cpu(*(__be64 *)node->data ^
				       *(__be64 *)key->data);

		prefixlen = 64 - fls64(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i = 8;
	}
#endif
	// 循环4个字节4个字节去匹配
	while (trie->data_size >= i + 4) {
		u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
				       *(__be32 *)&key->data[i]); // 异或，不匹配的bit为1

		prefixlen += 32 - fls(diff);   // Find Last Set，返回一个整数的最高有效位的位置（从1开始计数），取最高的不匹配的位置
		if (prefixlen >= limit)
			return limit; // 若超过limit，说明完全匹配上，直接返回limit
		if (diff)
			return prefixlen;  // 发现不匹配的位置，则当前的位置就是最长匹配前缀的位置
		i += 4; // 若没有diff，说明这四个byte都匹配上了，继续
	}

    // 如果trie->data_size不是8的倍数，需要再处理末尾的1，2，3个字节
	if (trie->data_size >= i + 2) {
		u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
				       *(__be16 *)&key->data[i]);

		prefixlen += 16 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i += 2;
	}

	if (trie->data_size >= i + 1) {
		prefixlen += 8 - fls(node->data[i] ^ key->data[i]);

		if (prefixlen >= limit)
			return limit;
	}

	return prefixlen;
}


/* Called from syscall or from eBPF program */
static int trie_update_elem(struct bpf_map *map,
			    void *_key, void *value, u64 flags)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
	struct lpm_trie_node __rcu **slot;
	struct bpf_lpm_trie_key *key = _key;
	unsigned long irq_flags;
	unsigned int next_bit;
	size_t matchlen = 0;
	int ret = 0;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (key->prefixlen > trie->max_prefixlen) //最大匹配长度校验
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Allocate and fill a new node */

	if (trie->n_entries == trie->map.max_entries) {  // 最大node容量校验
		ret = -ENOSPC;
		goto out;
	}

	new_node = lpm_trie_node_alloc(trie, value); // 分配新node
	if (!new_node) {
		ret = -ENOMEM;
		goto out;
	}

	trie->n_entries++;   // node计数增加

	new_node->prefixlen = key->prefixlen;
	RCU_INIT_POINTER(new_node->child[0], NULL);  // 初始化child指针
	RCU_INIT_POINTER(new_node->child[1], NULL);
	memcpy(new_node->data, key->data, trie->data_size);

	/* Now find a slot to attach the new node. To do that, walk the tree
	 * from the root and match as many bits as possible for each node until
	 * we either find an empty slot or a slot that needs to be replaced by
	 * an intermediate node.
	 */
	slot = &trie->root;

	while ((node = rcu_dereference_protected(*slot,
					lockdep_is_held(&trie->lock)))) {  //如果slot 非空，则将其值赋给 node，并进入循环体。
		matchlen = longest_prefix_match(trie, node, key);  // 匹配的长度

		if (node->prefixlen != matchlen || // 这边不相等的情况下matchlen一定比node->prefixlen小，也就是说key没有完全匹配上node，此时找到位置了，需要插在node上面
		    node->prefixlen == key->prefixlen ||  // 找到prefix完全匹配的节点，应该在这一层插入
		    node->prefixlen == trie->max_prefixlen) // node已经是trie最深的叶子节点了
			break;

		next_bit = extract_bit(key->data, node->prefixlen);  //从目标键 key 的数据中提取当前前缀长度位置的位，存储在 next_bit
		slot = &node->child[next_bit]; //slot是当前处理的指针，根据next_bit往下遍历
	}

	/* If the slot is empty (a free child pointer or an empty root),
	 * simply assign the @new_node to that slot and be done.
	 */
	if (!node) {
		rcu_assign_pointer(*slot, new_node); // 空位置则直接插入
		goto out;
	}

	/* If the slot we picked already exists, replace it with @new_node
	 * which already has the correct data array set.
	 
	 *   下面这种情况，matchlen == 24, key->prefixlen = 24，其实就是节点的替换，刷新value
     *   +----------------+           +----------------+   
     *   |     new_node   |           |       (1)      |
     *   | 192.168.0.0/24 |           | 192.168.0.0/16 |
     *   |    value: 3    |           |    value: 1    |
     *   |   [0]    [1]   |           |   [0]    [1]   |
     *   +----------------+           +----------------+  
     *            |                           |   
     			  +-------------------------->|
                     	             +----------------+ 
                     	             |       (2) node |  
                  	                 | 192.168.0.0/24 | 
                	                 |    value: 2    | 
                 	                 |   [0]    [1]   |  
                 	                 +----------------+ 
	 */
    
    
	if (node->prefixlen == matchlen) {
		new_node->child[0] = node->child[0];
		new_node->child[1] = node->child[1];

		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
			trie->n_entries--;

		rcu_assign_pointer(*slot, new_node);
		kfree_rcu(node, rcu);

		goto out;
	}

	/* If the new node matches the prefix completely, it must be inserted
	 * as an ancestor. Simply insert it between @node and *@slot.
	 
	 *   下面这种情况，matchlen == 17, key->prefixlen == 17，node->prefixlen > matchlen, 需要在node上面插入
     *   +----------------+           +----------------+   
     *   |       (3)  key |           |       (1)      |
     *   | 192.168.0.0/17 |           | 192.168.0.0/16 |
     *   |    value: 1    |           |    value: 1    |
     *   |   [0]    [1]   |           |   [0]    [1]   |
     *   +----------------+           +----------------+  
     *            |                           |   
     			  +-------------------------->|
                     	             +----------------+ 
                     	             |       (2) node |  
                  	                 | 192.168.0.0/24 | 
                	                 |    value: 2    | 
                 	                 |   [0]    [1]   |  
                 	                 +----------------+ 
 
	 */
	if (matchlen == key->prefixlen) {
		next_bit = extract_bit(node->data, matchlen);   // 上图例子：17bit是0， 所以是左孩子
		rcu_assign_pointer(new_node->child[next_bit], node); // node作为new_node的child
		rcu_assign_pointer(*slot, new_node);
		goto out;
	}

    /*
     *   下面这种情况，matchlen == 23, key->prefixlen == 24，node->prefixlen > matchlen, 需要插imnode，matchlen作为imnode的prefixlen
     *   +----------------+           +----------------+   
     *   |       (3)  new |           |       (1)      |
     *   | 192.168.1.0/24 |           | 192.168.0.0/16 |
     *   |    value: 1    |           |    value: 1    |
     *   |   [0]    [1]   |           |   [0]    [1]   |
     *   +----------------+           +----------------+  
     *            |                           |   
     			  +-------------------------->|
                     	             +----------------+ 
                     	             |       (2) node |  
                  	                 | 192.168.0.0/24 | 
                	                 |    value: 2    | 
                 	                 |   [0]    [1]   |  
                 	                 +----------------+ 
     
     						    ||
     							||
     							V
                 	                 
     *                      +----------------+
     *                      |       (1)  (R) |
     *                      | 192.168.0.0/16 |
     *                      |    value: 1    |
     *                      |   [0]    [1]   |
     *                      +----------------+
     *                           |      
     *            +----------------+ 
     *            |       (4)  (I) |
     *            | 192.168.0.0/23 | 
     *            |    value: ---  |  
     *            |   [0]    [1]   |  
     *            +----------------+ 
     *                 |      |
     *  +----------------+  +----------------+
     *  |       (2)      |  |  (5) new, key  |
     *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
     *  |    value: 2    |  |     value: 5   |
     *  |   [0]    [1]   |  |   [0]    [1]   |
     *  +----------------+  +----------------+
 
	 */
    
	im_node = lpm_trie_node_alloc(trie, NULL); // 假node，value是空
	if (!im_node) {
		ret = -ENOMEM;
		goto out;
	}

	im_node->prefixlen = matchlen;  // matchlen作为imnode的prefixlen
	im_node->flags |= LPM_TREE_NODE_FLAG_IM; // node标记
	memcpy(im_node->data, node->data, trie->data_size);

	/* Now determine which child to install in which slot */
	if (extract_bit(key->data, matchlen)) {   // key是newnode，所以如果1，说明new_node在im_node右孩子
		rcu_assign_pointer(im_node->child[0], node);
		rcu_assign_pointer(im_node->child[1], new_node);
	} else {
		rcu_assign_pointer(im_node->child[0], new_node);
		rcu_assign_pointer(im_node->child[1], node);
	}

	/* Finally, assign the intermediate node to the determined spot */
	rcu_assign_pointer(*slot, im_node);

out:
	if (ret) {
		if (new_node)
			trie->n_entries--;

		kfree(new_node);
		kfree(im_node);
	}

	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;
}

查询流程

/* Called from syscall or from eBPF program */
static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *found = NULL;
	struct bpf_lpm_trie_key *key = _key;

	/* Start walking the trie from the root node ... */

	for (node = rcu_dereference(trie->root); node;) {
		unsigned int next_bit;
		size_t matchlen;

		/* Determine the longest prefix of @node that matches @key.
		 * If it's the maximum possible prefix for this trie, we have
		 * an exact match and can return it directly.
		 */
		matchlen = longest_prefix_match(trie, node, key);
		if (matchlen == trie->max_prefixlen) { // 到底了，直接返回
			found = node;
			break;
		}

		/* If the number of bits that match is smaller than the prefix
		 * length of @node, bail out and return the node we have seen
		 * last in the traversal (ie, the parent).
		 */
		if (matchlen < node->prefixlen) // 没有完全匹配，那就找到最长匹配了，但是是上一个
			break;

		/* Consider this node as return candidate unless it is an
		 * artificially added intermediate one.
		 */
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) // 不是假node才算找到
			found = node;

		/* If the node match is fully satisfied, let's see if we can
		 * become more specific. Determine the next bit in the key and
		 * traverse down.
		 */
		next_bit = extract_bit(key->data, node->prefixlen); //完全匹配了，那就继续往下走继续匹配
		node = rcu_dereference(node->child[next_bit]);
	}

	if (!found)
		return NULL;

	return found->data + trie->data_size;  // value在data_size后面
}

windseek

be curious, be free