From 6fcc970ae7e31f44fa6b6aface983cadb023cc50 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 17 Nov 2022 16:10:44 +0700 Subject: [PATCH v901 2/2] Make node32 variable sized Add a size class for 15 elements, which corresponds to 160 bytes, an allocation size used by DSA. When a 16th element is to be inserted, allocate a larger area and memcpy the entire old node to it. NB: Zeroing the new area is only necessary if it's for an inner node128, since insert logic must check for null child pointers. This technique allows us to limit the node kinds to 4, which 1. limits the number of cases in switch statements 2. allows a possible future optimization to encode the node kind in a pointer tag --- src/backend/lib/radixtree.c | 141 +++++++++++++++++++++++++++--------- 1 file changed, 108 insertions(+), 33 deletions(-) diff --git a/src/backend/lib/radixtree.c b/src/backend/lib/radixtree.c index bef1a438ab..f368e750d5 100644 --- a/src/backend/lib/radixtree.c +++ b/src/backend/lib/radixtree.c @@ -130,6 +130,7 @@ typedef enum typedef enum rt_size_class { RT_CLASS_4_FULL = 0, + RT_CLASS_32_PARTIAL, RT_CLASS_32_FULL, RT_CLASS_128_FULL, RT_CLASS_256 @@ -147,6 +148,8 @@ typedef struct rt_node uint16 count; /* Max number of children. 
We can use uint8 because we never need to store 256 */ + /* WIP: if we don't have a variable sized node4, this should instead be in the base + types as needed, since saving every byte is crucial for the smallest node kind */ uint8 fanout; /* @@ -166,6 +169,8 @@ typedef struct rt_node ((node)->base.n.count < (node)->base.n.fanout) /* Base type of each node kinds for leaf and inner nodes */ +/* The base types must be able to accommodate the largest size +class for variable-sized node kinds */ typedef struct rt_node_base_4 { rt_node n; @@ -217,40 +222,40 @@ typedef struct rt_node_inner_4 { rt_node_base_4 base; - /* 4 children, for key chunks */ - rt_node *children[4]; + /* number of children depends on size class */ + rt_node *children[FLEXIBLE_ARRAY_MEMBER]; } rt_node_inner_4; typedef struct rt_node_leaf_4 { rt_node_base_4 base; - /* 4 values, for key chunks */ - uint64 values[4]; + /* number of values depends on size class */ + uint64 values[FLEXIBLE_ARRAY_MEMBER]; } rt_node_leaf_4; typedef struct rt_node_inner_32 { rt_node_base_32 base; - /* 32 children, for key chunks */ - rt_node *children[32]; + /* number of children depends on size class */ + rt_node *children[FLEXIBLE_ARRAY_MEMBER]; } rt_node_inner_32; typedef struct rt_node_leaf_32 { rt_node_base_32 base; - /* 32 values, for key chunks */ - uint64 values[32]; + /* number of values depends on size class */ + uint64 values[FLEXIBLE_ARRAY_MEMBER]; } rt_node_leaf_32; typedef struct rt_node_inner_128 { rt_node_base_128 base; - /* Slots for 128 children */ - rt_node *children[128]; + /* number of children depends on size class */ + rt_node *children[FLEXIBLE_ARRAY_MEMBER]; } rt_node_inner_128; typedef struct rt_node_leaf_128 @@ -260,8 +265,8 @@ typedef struct rt_node_leaf_128 /* isset is a bitmap to track which slot is in use */ uint8 isset[RT_NODE_NSLOTS_BITS(128)]; - /* Slots for 128 values */ - uint64 values[128]; + /* number of values depends on size class */ + uint64 values[FLEXIBLE_ARRAY_MEMBER]; } 
rt_node_leaf_128; /* @@ -307,32 +312,40 @@ typedef struct rt_size_class_elem * from the block. */ #define NODE_SLAB_BLOCK_SIZE(size) \ - Max((SLAB_DEFAULT_BLOCK_SIZE / (size)) * size, (size) * 32) + Max((SLAB_DEFAULT_BLOCK_SIZE / (size)) * (size), (size) * 32) static rt_size_class_elem rt_size_class_info[RT_SIZE_CLASS_COUNT] = { [RT_CLASS_4_FULL] = { .name = "radix tree node 4", .fanout = 4, - .inner_size = sizeof(rt_node_inner_4), - .leaf_size = sizeof(rt_node_leaf_4), - .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_4)), - .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_4)), + .inner_size = sizeof(rt_node_inner_4) + 4 * sizeof(rt_node *), + .leaf_size = sizeof(rt_node_leaf_4) + 4 * sizeof(uint64), + .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_4) + 4 * sizeof(rt_node *)), + .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_4) + 4 * sizeof(uint64)), + }, + [RT_CLASS_32_PARTIAL] = { + .name = "radix tree node 15", + .fanout = 15, + .inner_size = sizeof(rt_node_inner_32) + 15 * sizeof(rt_node *), + .leaf_size = sizeof(rt_node_leaf_32) + 15 * sizeof(uint64), + .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_32) + 15 * sizeof(rt_node *)), + .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_32) + 15 * sizeof(uint64)), }, [RT_CLASS_32_FULL] = { .name = "radix tree node 32", .fanout = 32, - .inner_size = sizeof(rt_node_inner_32), - .leaf_size = sizeof(rt_node_leaf_32), - .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_32)), - .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_32)), + .inner_size = sizeof(rt_node_inner_32) + 32 * sizeof(rt_node *), + .leaf_size = sizeof(rt_node_leaf_32) + 32 * sizeof(uint64), + .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_32) + 32 * sizeof(rt_node *)), + .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_32) + 32 * sizeof(uint64)), }, [RT_CLASS_128_FULL] = { .name = "radix tree node 128", .fanout = 128, - .inner_size = 
sizeof(rt_node_inner_128), - .leaf_size = sizeof(rt_node_leaf_128), - .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_128)), - .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_128)), + .inner_size = sizeof(rt_node_inner_128) + 128 * sizeof(rt_node *), + .leaf_size = sizeof(rt_node_leaf_128) + 128 * sizeof(uint64), + .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_128) + 128 * sizeof(rt_node *)), + .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_128) + 128 * sizeof(uint64)), }, [RT_CLASS_256] = { .name = "radix tree node 256", @@ -922,7 +935,6 @@ rt_free_node(radix_tree *tree, rt_node *node) #ifdef RT_DEBUG /* update the statistics */ - // FIXME for (i = 0; i < RT_SIZE_CLASS_COUNT; i++) { if (node->fanout == rt_size_class_info[i].fanout) @@ -1240,7 +1252,7 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke /* grow node from 4 to 32 */ new32 = (rt_node_inner_32 *) rt_copy_node(tree, (rt_node *) n4, - RT_NODE_KIND_32, RT_CLASS_32_FULL); + RT_NODE_KIND_32, RT_CLASS_32_PARTIAL); chunk_children_array_copy(n4->base.chunks, n4->children, new32->base.chunks, new32->children, n4->base.n.count); @@ -1282,6 +1294,37 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke if (unlikely(!NODE_HAS_FREE_SLOT(n32))) { + Assert(parent != NULL); + + if (n32->base.n.fanout == rt_size_class_info[RT_CLASS_32_PARTIAL].fanout) + { + /* use the same node kind, but expand to the next size class */ + + /* no need to zero the new memory */ + rt_node_inner_32 *new32 = + (rt_node_inner_32 *) MemoryContextAlloc(tree->inner_slabs[RT_CLASS_32_FULL], + rt_size_class_info[RT_CLASS_32_FULL].inner_size); + +// FIXME the API for rt_alloc_node and rt_node_copy are too entangled +#ifdef RT_DEBUG + /* update the statistics */ + tree->cnt[RT_CLASS_32_FULL]++; +#endif + /* copy the entire old node -- the new node is only different in having + additional slots so we only have to change the fanout */ + 
memcpy(new32, n32, rt_size_class_info[RT_CLASS_32_PARTIAL].inner_size); + new32->base.n.fanout = rt_size_class_info[RT_CLASS_32_FULL].fanout; + + rt_replace_node(tree, parent, (rt_node *) n32, (rt_node *) new32, + key); + + /* must update both pointers here */ + node = (rt_node *) new32; + n32 = new32; + goto retry_insert_inner_32; + } + else + { rt_node_inner_128 *new128; /* grow node from 32 to 128 */ @@ -1290,13 +1333,14 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke for (int i = 0; i < n32->base.n.count; i++) node_inner_128_insert(new128, n32->base.chunks[i], n32->children[i]); - Assert(parent != NULL); rt_replace_node(tree, parent, (rt_node *) n32, (rt_node *) new128, key); node = (rt_node *) new128; + } } else { +retry_insert_inner_32: int insertpos = node_32_get_insertpos((rt_node_base_32 *) n32, chunk); int16 count = n32->base.n.count; @@ -1409,12 +1453,10 @@ rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, /* grow node from 4 to 32 */ new32 = (rt_node_leaf_32 *) rt_copy_node(tree, (rt_node *) n4, - RT_NODE_KIND_32, RT_CLASS_32_FULL); + RT_NODE_KIND_32, RT_CLASS_32_PARTIAL); chunk_values_array_copy(n4->base.chunks, n4->values, new32->base.chunks, new32->values, n4->base.n.count); - - Assert(parent != NULL); rt_replace_node(tree, parent, (rt_node *) n4, (rt_node *) new32, key); node = (rt_node *) new32; @@ -1451,6 +1493,37 @@ rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, if (unlikely(!NODE_HAS_FREE_SLOT(n32))) { + Assert(parent != NULL); + + if (n32->base.n.fanout == rt_size_class_info[RT_CLASS_32_PARTIAL].fanout) + { + /* use the same node kind, but expand to the next size class */ + + /* no need to zero the new memory */ + rt_node_leaf_32 *new32 = + (rt_node_leaf_32 *) MemoryContextAlloc(tree->leaf_slabs[RT_CLASS_32_FULL], + rt_size_class_info[RT_CLASS_32_FULL].leaf_size); + +// FIXME the API for rt_alloc_node and rt_node_copy are too entangled +#ifdef RT_DEBUG + /* update 
the statistics */ + tree->cnt[RT_CLASS_32_FULL]++; +#endif + /* copy the entire old node -- the new node is only different in having + additional slots so we only have to change the fanout */ + memcpy(new32, n32, rt_size_class_info[RT_CLASS_32_PARTIAL].leaf_size); + new32->base.n.fanout = rt_size_class_info[RT_CLASS_32_FULL].fanout; + + rt_replace_node(tree, parent, (rt_node *) n32, (rt_node *) new32, + key); + + /* must update both pointers here */ + node = (rt_node *) new32; + n32 = new32; + goto retry_insert_leaf_32; + } + else + { rt_node_leaf_128 *new128; /* grow node from 32 to 128 */ @@ -1459,13 +1532,14 @@ rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, for (int i = 0; i < n32->base.n.count; i++) node_leaf_128_insert(new128, n32->base.chunks[i], n32->values[i]); - Assert(parent != NULL); rt_replace_node(tree, parent, (rt_node *) n32, (rt_node *) new128, key); node = (rt_node *) new128; + } } else { +retry_insert_leaf_32: int insertpos = node_32_get_insertpos((rt_node_base_32 *) n32, chunk); int count = n32->base.n.count; @@ -2189,10 +2263,11 @@ rt_verify_node(rt_node *node) void rt_stats(radix_tree *tree) { - ereport(LOG, (errmsg("num_keys = %lu, height = %u, n4 = %u, n32 = %u, n128 = %u, n256 = %u", + ereport(NOTICE, (errmsg("num_keys = %lu, height = %u, n4 = %u, n15 = %u, n32 = %u, n128 = %u, n256 = %u", tree->num_keys, tree->root->shift / RT_NODE_SPAN, tree->cnt[RT_CLASS_4_FULL], + tree->cnt[RT_CLASS_32_PARTIAL], tree->cnt[RT_CLASS_32_FULL], tree->cnt[RT_CLASS_128_FULL], tree->cnt[RT_CLASS_256]))); -- 2.38.1