-- a test table - just two integer columns -- we'll fill it with data, collect stats and see what is the estimated / actual number of rows CREATE TABLE test_table ( col_a INT, col_b INT ); -- just to speed up the stats collection CREATE INDEX test_table_a_idx ON test_table(col_a); CREATE INDEX test_table_b_idx ON test_table(col_b); -- table used to store statistics CREATE TABLE cross_stats ( histogram_a INT[], histogram_b INT[], cross_histogram INT[] ); /* * Collects statistics. * * This is a very stupid / slow implementation. */ CREATE OR REPLACE FUNCTION collect_stats(p_bins_a INT, p_bins_b INT) RETURNS void AS $$ DECLARE v_value INT; v_count INT; v_histogram_a INT[]; v_histogram_b INT[]; v_contingency_table INT[]; v_row RECORD; v_bin_idx INT; BEGIN -- count all rows SELECT count(*) INTO v_count FROM test_table; /* build histogram s */ -- lower borders SELECT MIN(col_a) INTO v_value FROM test_table; v_histogram_a := array_append(v_histogram_a, v_value); SELECT MIN(col_b) INTO v_value FROM test_table; v_histogram_b := array_append(v_histogram_b, v_value); -- inner borders FOR v_idx IN 1..(p_bins_a-1) LOOP SELECT col_a INTO v_value FROM test_table ORDER BY col_a LIMIT 1 OFFSET floor(v_idx * v_count / p_bins_a); v_histogram_a := array_append(v_histogram_a, v_value); END LOOP; FOR v_idx IN 1..(p_bins_b-1) LOOP SELECT col_b INTO v_value FROM test_table ORDER BY col_b LIMIT 1 OFFSET floor(v_idx * v_count / p_bins_b); v_histogram_b := array_append(v_histogram_b, v_value); END LOOP; -- upper borders SELECT MAX(col_a) INTO v_value FROM test_table; v_histogram_a := array_append(v_histogram_a, v_value); SELECT MAX(col_b) INTO v_value FROM test_table; v_histogram_b := array_append(v_histogram_b, v_value); /* build the contingency table */ -- init FOR v_idx_a IN 1..(p_bins_a*p_bins_b) LOOP v_contingency_table := array_append(v_contingency_table, 0); END LOOP; FOR v_row IN (SELECT * FROM test_table) LOOP v_bin_idx := get_contingency_bin(v_histogram_a, v_histogram_b, v_row.col_a, v_row.col_b); v_contingency_table[v_bin_idx] := v_contingency_table[v_bin_idx] + 1; END LOOP; -- save stats DELETE FROM cross_stats; INSERT INTO cross_stats VALUES (v_histogram_a, v_histogram_b, v_contingency_table); END; $$ LANGUAGE plpgsql; -- get ID of the bin in a (linearized) contingency table CREATE OR REPLACE FUNCTION get_contingency_bin(p_histogram_a INT[], p_histogram_b INT[], p_value_a INT, p_value_b INT) RETURNS INT AS $$ DECLARE v_idx_a INT; v_idx_b INT; BEGIN v_idx_a := get_histogram_bin(p_histogram_a, p_value_a); v_idx_b := get_histogram_bin(p_histogram_b, p_value_b); RETURN (v_idx_b - 1) * (array_upper(p_histogram_a,1)-1) + v_idx_a; END; $$ LANGUAGE plpgsql; -- get bin in a histogram CREATE OR REPLACE FUNCTION get_histogram_bin(p_histogram INT[], p_value INT) RETURNS INT AS $$ DECLARE v_tmp INT; BEGIN -- slow, bisection should be used ... FOR v_idx IN 1..(array_upper(p_histogram,1)-1) LOOP IF (p_value >= p_histogram[v_idx]) THEN v_tmp := v_idx; END IF; END LOOP; RETURN v_tmp; END; $$ LANGUAGE plpgsql; -- compute the estimate when there are range conditions on both columns, i.e. something like -- ... WHERE (col_a BETWEEN 40 AND 75) AND (col_b BETWEEN 75 AND 1293) CREATE OR REPLACE FUNCTION get_estimate(p_from_a INT, p_to_a INT, p_from_b INT, p_to_b INT) RETURNS INT AS $$ DECLARE -- bin indexes for col_a v_from_a_bin INT; v_to_a_bin INT; -- bin indexes for col_b v_from_b_bin INT; v_to_b_bin INT; -- the estimate v_estimate INT := 0; -- histograms (loaded from cross_stats) v_histogram_a INT[]; v_histogram_b INT[]; v_contingency INT[]; v_cont_idx INT; -- coefficients (used to compute area of a single bin) v_coeff_a FLOAT; v_coeff_b FLOAT; BEGIN SELECT histogram_a INTO v_histogram_a FROM cross_stats; SELECT histogram_b INTO v_histogram_b FROM cross_stats; SELECT cross_histogram INTO v_contingency FROM cross_stats; v_from_a_bin := get_histogram_bin(v_histogram_a, p_from_a); v_to_a_bin := get_histogram_bin(v_histogram_a, p_to_a); v_from_b_bin := get_histogram_bin(v_histogram_b, p_from_b); v_to_b_bin := get_histogram_bin(v_histogram_b, p_to_b); FOR v_idx_a IN v_from_a_bin..v_to_a_bin LOOP IF (v_from_a_bin = v_to_a_bin) THEN -- single bin v_coeff_a := (p_to_a - p_from_a)::float / (v_histogram_a[v_from_a_bin+1] - v_histogram_a[v_from_a_bin]); ELSIF (v_idx_a = v_from_a_bin) THEN -- starting bin v_coeff_a := (v_histogram_a[v_from_a_bin+1] - p_from_a)::float / (v_histogram_a[v_from_a_bin+1] - v_histogram_a[v_from_a_bin]); ELSIF (v_idx_a = v_to_a_bin) THEN -- last bin v_coeff_a := (p_to_a - v_histogram_a[v_to_a_bin])::float / (v_histogram_a[v_to_a_bin+1] - v_histogram_a[v_to_a_bin]); ELSE -- inner bins v_coeff_a := 1; END IF; FOR v_idx_b IN v_from_b_bin..v_to_b_bin LOOP IF (v_from_b_bin = v_to_b_bin) THEN -- single bin v_coeff_b := (p_to_b - p_from_b)::float / (v_histogram_b[v_from_b_bin+1] - v_histogram_b[v_from_b_bin]); ELSIF (v_idx_b = v_from_b_bin) THEN -- starting bin v_coeff_b := (v_histogram_b[v_from_b_bin+1] - p_from_b)::float / (v_histogram_b[v_from_b_bin+1] - v_histogram_b[v_from_b_bin]); ELSIF (v_idx_a = v_to_a_bin) THEN -- last bin v_coeff_b := (p_to_b - v_histogram_a[v_to_b_bin])::float / (v_histogram_a[v_to_b_bin+1] - v_histogram_a[v_to_b_bin]); ELSE -- inner bins v_coeff_b := 1; END IF; v_cont_idx := (v_idx_b - 1) * (array_upper(v_histogram_a,1)-1) + v_idx_a; v_estimate := v_estimate + round(v_contingency[v_cont_idx] * v_coeff_a * v_coeff_b); END LOOP; END LOOP; RETURN v_estimate; END; $$ LANGUAGE plpgsql; /* independent columns col_a | col_b | actual | expected | 10x10 | 20x20 | [50,70] | [50,70] | 41 | 40 | 41 | 47 | [50,250] | [50,250] | 4023 | 4024 | 4436 | 3944 | [50,250] | [750,950] | 4023 | 3955 | 4509 | 3933 | */ INSERT INTO test_table SELECT round(random()*1000), round(random()*1000) FROM generate_series(1,100000); /* positively dependent columns col_a | col_b | actual | expected | 10x10 | 20x20 | 40x40 | 100x100 | [50,70] | [50,70] | 2143 | 57 | 391 | 729 | 1468 | 1866 | [50,250] | [50,250] | 20181 | 4111 | 15401 | 19983 | 19985 | 19991 | [50,250] | [750,950] | 0 | 3977 | 0 | 0 | 0 | 0 | */ INSERT INTO test_table SELECT val, val FROM (SELECT round(random()*1000) AS val FROM generate_series(1,100000)) foo;