diff --git a/include/linux/trie_core.h b/include/linux/trie_core.h new file mode 100644 index 0000000..948a11a --- /dev/null +++ b/include/linux/trie_core.h @@ -0,0 +1,167 @@ +/* + * Lookup based on LC-trie/TRASH + * + * Copyright (C) 2006 Robert Olsson + * Uppsala, Sweden + * + * + * This work uses trie-functions from net/ipv4/fib_trie.c see source + * file for full copyrights and credits. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * References: TRASH. A dynamic LC-trie and hash data structure + * Robert Olsson Uppsala University, Stefan Nilsson KTH. ISSN 1653-7092 + * + */ + +#define VERSION "0.50" + +/* Set keylen in multiples of 32/64/96/128/256 etc + */ +#define LPK 5 /* long per key */ +#define BPL 32 /* bits per long */ +#define KLEN (BPL*LPK) + +#ifdef __KERNEL__ + +#define T_TNODE 0 +#define T_LEAF 1 +#define NODE_TYPE_MASK 0x1UL +#define NODE_PARENT(node) \ + ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK))) + +#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) + +#define NODE_SET_PARENT(node, ptr) \ + rcu_assign_pointer((node)->parent, \ + ((unsigned long)(ptr)) | NODE_TYPE(node)) + +#define IS_TNODE(n) (!(n->parent & T_LEAF)) +#define IS_LEAF(n) (n->parent & T_LEAF) + +/* Array datatype bit manipulation */ + +/* + * Key numbering. From Left to right, + * 0 - 31 + * 32 - 63 + * 64 - 95 etc + */ + +#define BITPOS(i) ((1<<(BPL-(i&0X1F)-1))) +#define VBIT_IS_SET(key, pos) (key[pos/BPL] & BITPOS(pos)) +#define VBIT_SET(key, pos) ((key[pos/BPL]) |= BITPOS(pos)) +#define VBIT_CLR(key, pos) ((key[pos/BPL]) &= ~(BITPOS(pos))) + +#define HALVE_THRESHOLD_DEFAULT 25 +#define INFLATE_THRESHOLD_DEFAULT 60 + +/* + * For trash special settings for root + * node is not so important + */ + +#define HALVE_THRESHOLD_ROOT_DEFAULT 15 +#define INFLATE_THRESHOLD_ROOT_DEFAULT 25 + +struct node { + __u32 key[LPK]; + unsigned int parent; +}; + +struct leaf { + __u32 key[LPK]; + unsigned int parent; + void *obj; /* Generic leaf object */ +// struct timeval start; + struct rcu_head rcu; +}; + +struct tnode { + __u32 key[LPK]; + unsigned long parent; + unsigned short pos; /* 2log(KEYLENGTH) bits needed */ + unsigned short bits; /* 2log(KEYLENGTH) bits needed */ + unsigned full_children; /* KEYLENGTH bits needed */ + unsigned empty_children; /* KEYLENGTH bits needed */ + struct rcu_head rcu; + struct node *child[0]; +}; + +struct trie { + struct node *trie; +#ifdef CONFIG_TRIE_STATS + struct trie_use_stats stats; +#endif + void *token; + int gc_thresh; + int gc_goal; + int size; + unsigned int revision; + int max_child_bits; + int halve_threshold_root; + int inflate_threshold_root; + int halve_threshold; + int inflate_threshold; +}; + +#define MAX_STAT_DEPTH 32 + +struct trie_stat { + unsigned int totdepth; + unsigned int maxdepth; + unsigned int tnodes; + unsigned int leaves; + unsigned int nullpointers; + unsigned int nodesizes[MAX_STAT_DEPTH]; +}; + +struct trie_ops +{ + int (*gc)(struct trie *); + void (*dump)(struct rtable *); + void (*free)(struct leaf *, struct trie_ops *); +}; + +void trie_init(struct trie *t, int max_child_bits, int gc_thresh, int gc_goal, + int halve_threshold, int inflate_threshold, + int halve_threshold_root, int inflate_threshold_root); + +struct leaf *trie_lookup(struct trie *t, __u32 *key); +struct leaf *trie_insert(struct trie *t, int *pdc, int *err, __u32 *key, struct trie_ops *); +int trie_remove_by_key(struct trie *t, __u32 *key, struct trie_ops *); +void trie_remove(struct trie *t, struct leaf *l, struct trie_ops *); +int trie_flush(struct trie *t, struct trie_ops *); +struct leaf *trie_nextleaf(struct trie *t, struct leaf *thisleaf); +void trie_collect_stats(struct trie *t, struct trie_stat *s); +void __rt_del(struct trie *t, struct rtable *rt, struct trie_ops *ops); + +int __init trie_proc_init(void); +void __init trie_proc_exit(void); + + +static inline void check_tnode(const struct tnode *tn) +{ + WARN_ON(tn && tn->pos+tn->bits > KLEN); +} + +static inline int tnode_child_length(const struct tnode *tn) +{ + return 1 << tn->bits; +} + +#endif /* __KERNEL__ */ diff --git a/net/core/trie_core.c b/net/core/trie_core.c new file mode 100644 index 0000000..21c75dd --- /dev/null +++ b/net/core/trie_core.c @@ -0,0 +1,1348 @@ +/* + * Lookup based on LC-trie/TRASH + * + * Copyright (C) 2006 Robert Olsson + * Uppsala, Sweden + * + * + * This work uses trie-functions from net/ipv4/fib_trie.c see source + * file for full copyrights and credits. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * References: TRASH. A dynamic LC-trie and hash data structure + * Robert Olsson Uppsala University, Stefan Nilsson KTH. ISSN 1653-7092 + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Note we can share the most of tnode resize functions + * with fib_trie.c to avoid code duplication. + */ + +#define PRINT(fmt,arg...) \ + printk(KERN_DEBUG fmt,##arg) + +static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); +static struct node *resize(struct trie *t, struct tnode *tn); +static struct tnode *inflate(struct trie *t, struct tnode *tn); +static struct tnode *halve(struct trie *t, struct tnode *tn); +static void tnode_free(struct tnode *tn); + +/* + * TODO: locking per trie/trash meditation + */ + +spinlock_t trie_write_lock = SPIN_LOCK_UNLOCKED; + +static void tkey_copy(__u32 *dest, __u32 *src) +{ + int i; + for(i = 0; i < LPK; i++) + dest[i] = src[i]; + return; +} + +static unsigned int tkey_extract_bits(__u32 *key, int offset, int bits) +{ + unsigned int res = 0; /* For res 2log(KEYLENGTH) bits needed */ + int i; + unsigned max; + unsigned int fi = offset / BPL; + unsigned int li = (offset+bits-1) / BPL; + + + if (offset >= KLEN) + return 0; + + if (fi == li) + return ((key[fi] << (offset&31))) >> (BPL - bits); + + if(offset+bits < KLEN) + max = offset+bits; + else + max = KLEN; + + for(i = offset; i < max; i++) { + res = (res<<1); + + if(VBIT_IS_SET(key, i)) + res |= 1; + } +#if 0 + if (fi == li && res != res2) { + printk(" bits=%d offset=%d key=%X res=%X res2=%X\n", bits, offset, key[li], res, res2); + } +#endif + return res; +} + +static inline int tkey_equals(__u32 *a, __u32 *b) +{ + int i; + for(i = 0; i < LPK; i++) + if( a[i] != b[i] ) + return 0; + return 1; +} + +static inline int tkey_sub_equals(__u32 *a, int offset, int bits, __u32 *b) +{ + int i; + int a1, b1; + + if (bits == 0 || offset >= KLEN) + return 1; + bits = (offset + bits) > KLEN ? (KLEN - offset) : bits; + + for(i = offset; i < offset+bits; i++) { + + a1 = VBIT_IS_SET(a, i); + b1 = VBIT_IS_SET(b, i); + + if( a1 ^ b1) + return 0; + } + return 1; +} + +static int tkey_mismatch(__u32 *a, int offset, __u32 *b) +{ + int i, a1, b1; + + for(i = offset; i < KLEN; i++) { + + a1 = VBIT_IS_SET(a, i); + b1 = VBIT_IS_SET(b, i); + + if( a1 ^ b1) + return i; + } + return i; +} + +static inline struct node *tnode_get_child(struct tnode *tn, int i) +{ + BUG_ON(i >= 1 << tn->bits); + + return rcu_dereference(tn->child[i]); +} + +static void __leaf_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct leaf, rcu)); +} + +static struct tnode *tnode_alloc(unsigned int size) +{ + struct page *pages; + + if (size <= PAGE_SIZE) + return kcalloc(size, 1, GFP_ATOMIC); + + pages = alloc_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size)); + if (!pages) + return NULL; + + return page_address(pages); +} + +static void __tnode_free_rcu(struct rcu_head *head) +{ + struct tnode *tn = container_of(head, struct tnode, rcu); + unsigned int size = sizeof(struct tnode) + + (1 << tn->bits) * sizeof(struct node *); + + if(!IS_TNODE(tn)) + printk("ERROR not tnode\n"); + + if (size <= PAGE_SIZE) + kfree(tn); + else + free_pages((unsigned long)tn, get_order(size)); +} + +static inline void leaf_free(struct leaf *l, struct trie_ops *ops) +{ + + if(!l) + printk("ERROR LEAF\n"); + + if(ops->free) + ops->free(l, ops); + + call_rcu_bh(&l->rcu, __leaf_free_rcu); +} + +static inline void tnode_free(struct tnode *tn) +{ + if(!tn) printk("ERROR NULLL not tnode\n"); + + if(IS_LEAF(tn)) { + struct leaf *l = (struct leaf *) tn; + printk("ERROR wrong node type=%p \n", l); + + call_rcu_bh(&l->rcu, __leaf_free_rcu); + } + else + call_rcu_bh(&tn->rcu, __tnode_free_rcu); +} + +static struct leaf *leaf_new(void) +{ + struct leaf *l = kmalloc(sizeof(struct leaf), GFP_ATOMIC); + + if (l) + l->parent = T_LEAF; + + return l; +} + +static struct tnode* tnode_new(__u32 *key, int pos, int bits) +{ + int nchildren = 1<parent = T_TNODE; + tn->pos = pos; + tn->bits = bits; + tkey_copy(tn->key, key); + tn->full_children = 0; + tn->empty_children = 1<pos == tn->pos + tn->bits; +} + +static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) +{ + tnode_put_child_reorg(tn, i, n, -1); +} + + /* + * Add a child at position i overwriting the old value. + * Update the value of full_children and empty_children. + */ + +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) +{ + struct node *chi = tn->child[i]; + int isfull; + + BUG_ON(i >= 1<bits); + + + /* update emptyChildren */ + if (n == NULL && chi != NULL) + tn->empty_children++; + else if (n != NULL && chi == NULL) + tn->empty_children--; + + /* update fullChildren */ + if (wasfull == -1) + wasfull = tnode_full(tn, chi); + + isfull = tnode_full(tn, n); + if (wasfull && !isfull) + tn->full_children--; + else if (!wasfull && isfull) + tn->full_children++; + + if (n) + NODE_SET_PARENT(n, tn); + + rcu_assign_pointer(tn->child[i], n); +} + +static struct node *resize(struct trie *t, struct tnode *tn) +{ + int i; + int err = 0; + struct tnode *old_tn; + int inflate_threshold_use; + int halve_threshold_use; + int max_resize; + + if (!tn) + return NULL; + +#if 0 + pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); +#endif + + /* No children */ + if (tn->empty_children == tnode_child_length(tn)) { + tnode_free(tn); + return NULL; + } + /* One child */ + if (tn->empty_children == tnode_child_length(tn) - 1) + for (i = 0; i < tnode_child_length(tn); i++) { + struct node *n; + + n = tn->child[i]; + if (!n) + continue; + + /* compress one level */ + NODE_SET_PARENT(n, NULL); + tnode_free(tn); + return n; + } + + /* + * Double as long as the resulting node has a number of + * nonempty nodes that are above the threshold. + */ + + /* + * From "Implementing a dynamic compressed trie" by Stefan Nilsson of + * the Helsinki University of Technology and Matti Tikkanen of Nokia + * Telecommunications, page 6: + * "A node is doubled if the ratio of non-empty children to all + * children in the *doubled* node is at least 'high'." + * + * 'high' in this instance is the variable 'inflate_threshold'. It + * is expressed as a percentage, so we multiply it with + * tnode_child_length() and instead of multiplying by 2 (since the + * child array will be doubled by inflate()) and multiplying + * the left-hand side by 100 (to handle the percentage thing) we + * multiply the left-hand side by 50. + * + * The left-hand side may look a bit weird: tnode_child_length(tn) + * - tn->empty_children is of course the number of non-null children + * in the current node. tn->full_children is the number of "full" + * children, that is non-null tnodes with a skip value of 0. + * All of those will be doubled in the resulting inflated tnode, so + * we just count them one extra time here. + * + * A clearer way to write this would be: + * + * to_be_doubled = tn->full_children; + * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * tn->full_children; + * + * new_child_length = tnode_child_length(tn) * 2; + * + * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / + * new_child_length; + * if (new_fill_factor >= inflate_threshold) + * + * ...and so on, tho it would mess up the while () loop. + * + * anyway, + * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= + * inflate_threshold + * + * avoid a division: + * 100 * (not_to_be_doubled + 2*to_be_doubled) >= + * inflate_threshold * new_child_length + * + * expand not_to_be_doubled and to_be_doubled, and shorten: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children) >= inflate_threshold * new_child_length + * + * expand new_child_length: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children) >= + * inflate_threshold * tnode_child_length(tn) * 2 + * + * shorten again: + * 50 * (tn->full_children + tnode_child_length(tn) - + * tn->empty_children) >= inflate_threshold * + * tnode_child_length(tn) + * + */ + + check_tnode(tn); + + /* Keep root node larger */ + + if(!tn->parent) + inflate_threshold_use = t->inflate_threshold_root; + else + inflate_threshold_use = t->inflate_threshold; + + err = 0; + max_resize = 10; + while ((tn->full_children > 0 && + 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= + inflate_threshold_use * tnode_child_length(tn)) && + max_resize-- && (tn->bits < t->max_child_bits) ) { + + old_tn = tn; + tn = inflate(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; +#ifdef CONFIG_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } + } + + if(max_resize <= 0) { + + if(!tn->parent) { + PRINT("New inflate_threshold_root=%d size(bits)=%d max_size(bits)=%d\n", + t->inflate_threshold_root, tn->bits, t->max_child_bits); + } + else { + t->inflate_threshold += 1; + PRINT("New inflate_threshold=%d size(bits)=%d\n", t->inflate_threshold, tn->bits); + } + } + + check_tnode(tn); + + /* + * Halve as long as the number of empty children in this + * node is above threshold. + */ + + + /* Keep root node larger */ + + if(!tn->parent) + halve_threshold_use = t->halve_threshold_root; + else + halve_threshold_use = t->halve_threshold; + + err = 0; + max_resize = 10; + while (tn->bits > 1 && + 100 * (tnode_child_length(tn) - tn->empty_children) < + halve_threshold_use * tnode_child_length(tn) && max_resize--) { + + old_tn = tn; + tn = halve(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; +#ifdef CONFIG_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } + } + + + if(max_resize < 0) { + + if(!tn->parent) { + t->halve_threshold_root++; + PRINT("New halve_threshold_root=%d size(bits)=%d\n", t->halve_threshold_root, + tn->bits); + } + else { + t->halve_threshold++; + PRINT("New halve_threshold=%d size(bits)=%d\n", t->halve_threshold, + tn->bits); + } + } + + + /* Only one child remains */ + if (tn->empty_children == tnode_child_length(tn) - 1) + for (i = 0; i < tnode_child_length(tn); i++) { + struct node *n; + + n = tn->child[i]; + if (!n) + continue; + + /* compress one level */ + + NODE_SET_PARENT(n, NULL); + tnode_free(tn); + return n; + } + + return (struct node *) tn; +} + +static struct tnode *inflate(struct trie *t, struct tnode *tn) +{ + struct tnode *inode; + struct tnode *oldtnode = tn; + int olen = tnode_child_length(tn); + int i; + + pr_debug("In inflate\n"); + + tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); + + if (!tn) + return ERR_PTR(-ENOMEM); + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and inflate + * of tnode is ignored. + */ + + for (i = 0; i < olen; i++) { + struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); + + if (inode && + IS_TNODE(inode) && + inode->pos == oldtnode->pos + oldtnode->bits && + inode->bits > 1) { + struct tnode *left, *right; + + left = tnode_new(inode->key, inode->pos + 1, + inode->bits - 1); + if (!left) + goto nomem; + + VBIT_CLR(left->key, inode->pos); + + right = tnode_new(inode->key, inode->pos + 1, + inode->bits - 1); + + if (!right) { + tnode_free(left); + goto nomem; + } + + VBIT_SET(right->key, inode->pos); + + put_child(t, tn, 2*i, (struct node *) left); + put_child(t, tn, 2*i+1, (struct node *) right); + } + } + + for (i = 0; i < olen; i++) { + struct node *node = tnode_get_child(oldtnode, i); + struct tnode *left, *right; + int size, j; + + /* An empty child */ + if (node == NULL) + continue; + + /* A leaf or an internal node with skipped bits */ + + if (IS_LEAF(node) || ((struct tnode *) node)->pos > + tn->pos + tn->bits - 1) { + if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, + 1) == 0) + put_child(t, tn, 2*i, node); + else + put_child(t, tn, 2*i+1, node); + continue; + } + + /* An internal node with two children */ + inode = (struct tnode *) node; + + if (inode->bits == 1) { + put_child(t, tn, 2*i, inode->child[0]); + put_child(t, tn, 2*i+1, inode->child[1]); + + tnode_free(inode); + continue; + } + + /* An internal node with more than two children */ + + /* We will replace this node 'inode' with two new + * ones, 'left' and 'right', each with half of the + * original children. The two new nodes will have + * a position one bit further down the key and this + * means that the "significant" part of their keys + * (see the discussion near the top of this file) + * will differ by one bit, which will be "0" in + * left's key and "1" in right's key. Since we are + * moving the key position by one step, the bit that + * we are moving away from - the bit at position + * (inode->pos) - is the one that will differ between + * left and right. So... we synthesize that bit in the + * two new keys. + * The mask 'm' below will be a single "one" bit at + * the position (inode->pos) + */ + + /* Use the old key, but set the new significant + * bit to zero. + */ + + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); + + BUG_ON(!left); + + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); + + BUG_ON(!right); + + size = tnode_child_length(left); + for (j = 0; j < size; j++) { + put_child(t, left, j, inode->child[j]); + put_child(t, right, j, inode->child[j + size]); + } + put_child(t, tn, 2*i, resize(t, left)); + put_child(t, tn, 2*i+1, resize(t, right)); + + tnode_free(inode); + } + tnode_free(oldtnode); + return tn; +nomem: + { + int size = tnode_child_length(tn); + int j; + + for (j = 0; j < size; j++) + if (tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + return ERR_PTR(-ENOMEM); + } +} + +static struct tnode *halve(struct trie *t, struct tnode *tn) +{ + struct tnode *oldtnode = tn; + struct node *left, *right; + int i; + int olen = tnode_child_length(tn); + + pr_debug("In halve\n"); + + tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); + + if (!tn) + return ERR_PTR(-ENOMEM); + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and halve + * of tnode is ignored. + */ + + for (i = 0; i < olen; i += 2) { + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* Two nonempty children */ + if (left && right) { + struct tnode *newn; + + newn = tnode_new(left->key, tn->pos + tn->bits, 1); + + if (!newn) + goto nomem; + + put_child(t, tn, i/2, (struct node *)newn); + } + + } + + for (i = 0; i < olen; i += 2) { + struct tnode *newBinNode; + + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* At least one of the children is empty */ + if (left == NULL) { + if (right == NULL) /* Both are empty */ + continue; + put_child(t, tn, i/2, right); + continue; + } + + if (right == NULL) { + put_child(t, tn, i/2, left); + continue; + } + + /* Two nonempty children */ + newBinNode = (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); + put_child(t, newBinNode, 0, left); + put_child(t, newBinNode, 1, right); + put_child(t, tn, i/2, resize(t, newBinNode)); + } + tnode_free(oldtnode); + return tn; +nomem: + { + int size = tnode_child_length(tn); + int j; + + for (j = 0; j < size; j++) + if (tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + return ERR_PTR(-ENOMEM); + } +} + +static struct node *trie_rebalance(struct trie *t, struct tnode *tn) +{ + int wasfull; + __u32 cindex; + struct tnode *tp = NULL; + + while (tn != NULL && NODE_PARENT(tn) != NULL) { + + tp = NODE_PARENT(tn); + cindex = tkey_extract_bits(tn->key, tp->pos, tp->bits); + wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); + tn = (struct tnode *) resize (t, (struct tnode *)tn); + tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); + + if (!NODE_PARENT(tn)) + break; + + tn = NODE_PARENT(tn); + } + /* Handle last (top) tnode */ + if (IS_TNODE(tn)) + tn = (struct tnode*) resize(t, (struct tnode *)tn); + + return (struct node*) tn; +} + +struct leaf *trie_lookup(struct trie *t, __u32 *key) +{ + int pos; + struct tnode *tn; + struct node *n; + + pos = 0; + n = rcu_dereference(t->trie); + + while (n != NULL && NODE_TYPE(n) == T_TNODE) { + tn = (struct tnode *) n; + n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + } + + /* Case we have found a leaf. Compare prefixes */ + + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) + return (struct leaf *)n; + + return NULL; +} + +void trie_init(struct trie *t, int max_child_bits, int gc_thresh, int gc_goal, + int halve_threshold, int inflate_threshold, + int halve_threshold_root, int inflate_threshold_root) +{ + if (!t) + return; + + memset(t, 0, sizeof(struct trie)); + t->max_child_bits = max_child_bits; + t->gc_thresh = gc_thresh; + t->gc_goal = gc_goal; + t->halve_threshold = halve_threshold; + t->inflate_threshold = inflate_threshold; + t->halve_threshold_root = halve_threshold_root; + t->inflate_threshold_root = inflate_threshold_root; + +#ifdef CONFIG_TRIE_STATS + memset(&t->stats, 0, sizeof(struct trie_use_stats)); +#endif + PRINT("trie: Version %s Keylengh=%d Bits_Per_long=%d\n", VERSION, KLEN, BPL); +} + + +/* only used from updater-side. trie_write_lock needed */ + +struct leaf *trie_insert(struct trie *t, int *pdc, int *err, __u32 *key, struct trie_ops *ops) +{ + int pos, newpos; + struct tnode *tp = NULL, *tn = NULL; + struct node *n; + struct leaf *l; + int missbit; + __u32 cindex; + + if(ops->gc) + *pdc = ops->gc(t); + + n = rcu_dereference(t->trie); + + /* If we point to NULL, stop. Either the tree is empty and we should + * just put a new leaf in if, or we have reached an empty child slot, + * and we should just put our new leaf in that. + * If we point to a T_TNODE, check if it matches our key. Note that + * a T_TNODE might be skipping any number of bits - its 'pos' need + * not be the parent's 'pos'+'bits'! + * + * If it does match the current key, get pos/bits from it, extract + * the index from our key, push the T_TNODE and walk the tree. + * + * If it doesn't, we have to replace it with a new T_TNODE. + * + * If we point to a T_LEAF, it might or might not have the same key + * as we do. If it does, just change the value, update the T_LEAF's + * value, and return it. + * If it doesn't, we need to replace it with a T_TNODE. + */ + + pos = 0; + while (n != NULL && NODE_TYPE(n) == T_TNODE) { + tn = (struct tnode *) n; + + check_tnode(tn); + + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { + tp = tn; + pos = tn->pos + tn->bits; + n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + + BUG_ON(n && NODE_PARENT(n) != tn); + } else + break; + } + + /* + * n ----> NULL, LEAF or TNODE + * + * tp is n's (parent) ----> NULL or TNODE + */ + + BUG_ON(tp && IS_LEAF(tp)); + + /* Case 1: n is a leaf. Compare prefixes */ + + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { + + l = (struct leaf *) n; + + *err = -EEXIST; + goto found; + + } + t->size++; + l = leaf_new(); + + if (!l) { + *err = -ENOMEM; + goto err; + } + + tkey_copy( l->key, key); + + if (t->trie && n == NULL) { + /* Case 2: n is NULL, and will just insert a new leaf */ + + NODE_SET_PARENT(l, tp); + + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + } else { + /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ + /* + * Add a new tnode here + * first tnode need some special handling + */ + + if (tp) + pos = tp->pos+tp->bits; + else + pos = 0; + + if (n) { + newpos = tkey_mismatch(key, pos, n->key); + tn = tnode_new(n->key, newpos, 1); + } else { + newpos = 0; + tn = tnode_new(key, newpos, 1); /* First tnode */ + } + + if (!tn) { + leaf_free(l, ops); + *err = -ENOMEM; + goto err; + } + + NODE_SET_PARENT(tn, tp); + + missbit = tkey_extract_bits(key, newpos, 1); + put_child(t, tn, missbit, (struct node *)l); + put_child(t, tn, 1-missbit, n); + + if (tp) { + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); + } else { + rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ + tp = tn; + } + } + + if (tp && tp->pos + tp->bits > KLEN) + PRINT("trie error: tp=%p pos=%d, bits=%d, key=%0x\n", + tp, tp->pos, tp->bits, key[0]); + + /* Rebalance the trie */ + + rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); + + t->revision++; + +found: + return l; +err: + return NULL; + +} + +/* trie_write_lock should be held */ + +void trie_remove(struct trie *t, struct leaf *l, struct trie_ops *ops) +{ + __u32 cindex; + struct tnode *tp; + + if(!l) + return; + + pr_debug("entering trie_remove(%p)\n", l); + + /* + * Remove the leaf and rebalance the tree + */ + + t->revision++; + t->size--; + + tp = NODE_PARENT(l); + + if (tp) { + cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, NULL); + rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); + } else + rcu_assign_pointer(t->trie, NULL); + + leaf_free(l, ops); +} + +/* trie_write_lock should be held */ + +int __trie_remove_by_key(struct trie *t, __u32 *key, struct trie_ops *tops) +{ + struct node *n = t->trie; + struct leaf *l; + + pr_debug("entering trie_remove_by_key(%p)\n", n); + + /* Note that in the case skipped bits, those bits are *not* checked! + * When we finish this, we will have NULL or a T_LEAF, and the + * T_LEAF may or may not match our key. + */ + + while (n != NULL && IS_TNODE(n)) { + struct tnode *tn = (struct tnode *) n; + check_tnode(tn); + n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); + + BUG_ON(n && NODE_PARENT(n) != tn); + } + + l = (struct leaf *) n; + + if (!n || !tkey_equals(l->key, key)) + return 0; + + trie_remove(t, l, tops); + return 1; +} + +int trie_remove_by_key(struct trie *t, __u32 *key, struct trie_ops *tops) +{ + int res; + spin_lock(&trie_write_lock); + res = __trie_remove_by_key(t, key, tops); + spin_unlock(&trie_write_lock); + return res; +} + +/* rcu_read_lock needs to be hold by caller from readside */ + +struct leaf *trie_nextleaf(struct trie *t, struct leaf *thisleaf) +{ + struct node *c = (struct node *) thisleaf; + struct tnode *p; + int idx; + struct node *trie = rcu_dereference(t->trie); + + if (c == NULL) { + if (trie == NULL) + return NULL; + + if (IS_LEAF(trie)) /* trie w. just a leaf */ + return (struct leaf *) trie; + + p = (struct tnode*) trie; /* Start */ + } else + p = (struct tnode *) NODE_PARENT(c); + + while (p) { + int pos, last; + + /* Find the next child of the parent */ + if (c) + pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); + else + pos = 0; + + last = 1 << p->bits; + for (idx = pos; idx < last ; idx++) { + c = rcu_dereference(p->child[idx]); + + if (!c) + continue; + + /* Decend if tnode */ + while (IS_TNODE(c)) { + p = (struct tnode *) c; + idx = 0; + + /* Rightmost non-NULL branch */ + if (p && IS_TNODE(p)) + while (!(c = rcu_dereference(p->child[idx])) + && idx < (1<bits)) idx++; + + /* Done with this tnode? */ + if (idx >= (1 << p->bits) || !c) + goto up; + } + return (struct leaf *) c; + } +up: + /* No more children go up one step */ + c = (struct node *) p; + p = (struct tnode *) NODE_PARENT(p); + } + return NULL; /* Ready. Root of trie */ +} + +/* Instead of doing a flush a series of removes we take down + * the trie in shot. It's fast. + */ + +/* trie_write_lock needs to be hold */ + +int _trie_flush(struct trie *t, struct trie_ops *ops) +{ + struct node *c = NULL; + int idx; + struct tnode *p = (struct tnode *) rcu_dereference(t->trie); + int i = 0; + + if (p == NULL) + return 0; + + /* trie w. just a leaf */ + + if (IS_LEAF(p)) { + rcu_assign_pointer(t->trie, NULL); + leaf_free((struct leaf *) p, ops); + i++; + return i; + } + +start: + while (p) { + int pos, last; + + if(c) { + /* This child is to be removed + * We get it's pos from parent's + * child array. Nullify it and + * free it. + */ + + pos = tkey_extract_bits(c->key, p->pos, p->bits); + put_child(t, (struct tnode *)p, pos, NULL); + + if( IS_TNODE(c) ) + tnode_free((struct tnode *) c); + + else { + leaf_free((struct leaf *) c, ops); + i++; + t->size--; + } + + /* And moves on to next child */ + pos = pos + 1; + } + else + pos = 0; /* Start */ + + last = 1 << p->bits; + for (idx = pos; idx < last ; idx++) { + c = rcu_dereference(p->child[idx]); + + if (!c) + continue; + + /* Decend if tnode */ + while (IS_TNODE(c)) { + p = (struct tnode *) c; + idx = 0; + + /* Rightmost non-NULL branch */ + if (p && IS_TNODE(p)) + while (!(c = rcu_dereference(p->child[idx])) + && idx < (1<bits)) idx++; + + /* Done with this tnode? */ + if (idx >= (1 << p->bits) || !c) + goto up; + } + p = (struct tnode *) NODE_PARENT(c); + goto start; + } +up: + /* No more children go up one step */ + c = (struct node *) p; + p = (struct tnode *) NODE_PARENT(p); + } + + rcu_assign_pointer(t->trie, NULL); + tnode_free((struct tnode *) c); /* Remove root node */ + + return i; +} + +int trie_flush(struct trie *t, struct trie_ops *ops) +{ + int i; + + /* Can run from both BH and non-BH contexts */ + + spin_lock_bh(&trie_write_lock); + +#ifdef OLDFLUSH + { + struct leaf *ll = NULL, *l = NULL; + + for (i = 0; (l = trie_nextleaf(t, l)) != NULL; i++) { + + if(ll) + trie_remove(t, ll, ops, 0); + ll = l; + } + + trie_remove(t, ll, fops, 0); + } +#else + i = _trie_flush(t, ops); +#endif + t->token = NULL; + + spin_unlock_bh(&trie_write_lock); + + return i; +} + +/* Depth first Trie walk iterator */ +struct trie_iter { + struct tnode *tnode; + struct trie *trie; + unsigned index; + unsigned depth; + int nflows; +}; + +static struct node *trie_get_next(struct trie_iter *iter) +{ + struct tnode *tn = iter->tnode; + unsigned cindex = iter->index; + struct tnode *p; + + /* A single entry routing table */ + if(!tn) + return NULL; + + pr_debug("get_next iter={node=%p index=%d depth=%d}\n", + iter->tnode, iter->index, iter->depth); + +rescan: + while (cindex < (1<bits)) { + struct node *n = tnode_get_child(tn, cindex); + + if (n) { + if (IS_LEAF(n)) { + iter->tnode = tn; + iter->index = cindex + 1; + } else { + /* push down one level */ + iter->tnode = (struct tnode *) n; + iter->index = 0; + ++iter->depth; + } + return n; + } + + ++cindex; + } + + /* Current node exhausted, pop back up */ + p = NODE_PARENT(tn); + if (p) { + cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; + tn = p; + --iter->depth; + goto rescan; + } + + /* got root? */ + return NULL; +} + +static struct node *trie_get_first(struct trie_iter *iter, + struct trie *t) +{ + struct node *n = rcu_dereference(t->trie); + + if (n) { + if (IS_TNODE(n)) { + iter->tnode = (struct tnode *) n; + iter->trie = t; + iter->index = 0; + iter->depth = 1; + } else { + iter->tnode = NULL; + iter->trie = t; + iter->index = 0; + iter->depth = 0; + } + return n; + } + return NULL; +} + +void trie_collect_stats(struct trie *t, struct trie_stat *s) +{ + struct node *n; + struct trie_iter iter; + + memset(s, 0, sizeof(*s)); + + rcu_read_lock_bh(); + for (n = trie_get_first(&iter, t); n; n = trie_get_next(&iter)) { + if (IS_LEAF(n)) { + s->leaves++; + s->totdepth += iter.depth; + if (iter.depth > s->maxdepth) + s->maxdepth = iter.depth; + } else { + const struct tnode *tn = (const struct tnode *) n; + int i; + + s->tnodes++; + if(tn->bits < MAX_STAT_DEPTH) + s->nodesizes[tn->bits]++; + + for (i = 0; i < (1<bits); i++) + if (!tn->child[i]) + s->nullpointers++; + } + } + rcu_read_unlock_bh(); +} + +EXPORT_SYMBOL(trie_insert); +EXPORT_SYMBOL(trie_init); +EXPORT_SYMBOL(trie_remove_by_key); +EXPORT_SYMBOL(trie_lookup); +EXPORT_SYMBOL(trie_flush); +EXPORT_SYMBOL(_trie_flush); +EXPORT_SYMBOL(trie_nextleaf); +EXPORT_SYMBOL(trie_collect_stats); +