-- create table zip_codes_usa (zip_code char(5), latitude float, longitude float, zip_class char(1), post_name varchar(28), state char(2), county char(3)); insert into zip_codes_usa select zip_code, latitude::float, longitude::float, zip_class, poname, state, county from zipnov99 ; -- table used to store statistics CREATE TABLE cross_stats ( columns VARCHAR[], ndistinct INT[] ); /* * Collects statistics. * * This is a very stupid / slow implementation. */ CREATE OR REPLACE FUNCTION collect_stats(p_col_a VARCHAR, p_col_b VARCHAR) RETURNS void AS $$ DECLARE v_columns VARCHAR[]; v_dist_total INT; v_dist_a INT; v_dist_b INT; v_distinct INT[]; BEGIN /* columns */ v_columns := array_append(v_columns, p_col_a); v_columns := array_append(v_columns, p_col_b); RAISE NOTICE 'counting distinct values ...'; EXECUTE 'SELECT COUNT(DISTINCT ' || p_col_a || ') dist_a, COUNT(DISTINCT ' || p_col_b || ') dist_b, COUNT(DISTINCT ' || p_col_a || ' || '':'' || ' || p_col_b || ') dist_total FROM zip_codes_usa' INTO v_dist_a, v_dist_b, v_dist_total; v_distinct := array_append(v_distinct, v_dist_a); v_distinct := array_append(v_distinct, v_dist_b); v_distinct := array_append(v_distinct, v_dist_total); -- save stats DELETE FROM cross_stats; INSERT INTO cross_stats VALUES (v_columns, v_distinct); END; $$ LANGUAGE plpgsql; -- compute the estimate when there are range conditions on both columns, i.e. something like -- ... WHERE (col_a = '40999') AND (col_b = '029') CREATE OR REPLACE FUNCTION get_estimate(p_value_a VARCHAR, p_value_b VARCHAR) RETURNS INT[] AS $$ DECLARE -- the estimate v_estimate FLOAT; v_estimates INT[]; -- coefficients v_count FLOAT; v_coeffs FLOAT[]; v_ndistinct INT[]; v_columns VARCHAR[]; BEGIN SELECT columns, ndistinct INTO v_columns, v_ndistinct FROM cross_stats; SELECT reltuples INTO v_count FROM pg_class WHERE relname = 'zip_codes_usa'; v_estimate := v_count * get_frequency('zip_codes_usa', v_columns[1], p_value_a); v_estimates := array_append(v_estimates, round(v_estimate)::int); RAISE NOTICE 'estimate for column % is %',v_columns[1],v_estimate; v_estimate := v_count * get_frequency('zip_codes_usa', v_columns[2], p_value_b); v_estimates := array_append(v_estimates, round(v_estimate)::int); RAISE NOTICE 'estimate for column % is %',v_columns[2],v_estimate; -- heuristics (not part of the solution described in the article) IF (v_ndistinct[1] = v_ndistinct[3]) THEN v_estimate := v_estimates[1]; ELSIF (v_ndistinct[1] = v_ndistinct[3]) THEN v_estimate := v_estimates[2]; ELSE v_estimate := (v_ndistinct[1]::float / v_ndistinct[3]) * v_estimates[1] + (v_ndistinct[2]::float / v_ndistinct[3]) * v_estimates[2]; END IF; v_estimates := array_append(v_estimates, round(v_estimate)::int); RAISE NOTICE 'combined estimate is %',v_estimate; RETURN v_estimates; END; $$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION get_frequency(p_table VARCHAR, p_column VARCHAR, p_value VARCHAR) RETURNS FLOAT AS $$ DECLARE v_values VARCHAR[]; v_freqs FLOAT[]; v_freq FLOAT := 0; v_ndist INT; BEGIN SELECT n_distinct, most_common_vals, most_common_freqs INTO v_ndist, v_values, v_freqs FROM pg_stats WHERE tablename = p_table AND attname = p_column; IF (v_values IS NULL) THEN v_freq := (1::float / v_ndist); RAISE NOTICE 'frequency for column % is %', p_column, v_freq; RETURN v_freq; END IF; -- is it one of the MCVs? FOR v_idx IN 1..array_upper(v_values,1) LOOP IF (v_values[v_idx] = p_value) THEN v_freq := v_freqs[v_idx]; RAISE NOTICE 'frekvency for column % is % (from MCV)', p_column, v_freq; RETURN v_freqs[v_idx]; END IF; v_freq := v_freq + v_freqs[v_idx]; END LOOP; v_ndist := (v_ndist - array_upper(v_values,1)); IF (v_ndist <= 0) THEN v_ndist := 1; END IF; v_freq := (1 - v_freq) / v_ndist; RAISE NOTICE 'frequency for colunm % is %', p_column, v_freq; RETURN v_freq; END; $$ LANGUAGE plpgsql;