-- a test table - just two integer columns
-- we'll fill it with data, collect stats and see what is the estimated / actual number of rows
CREATE TABLE test_table (
    col_a INT,
    col_b INT
);

-- just to speed up the stats collection
CREATE INDEX test_table_a_idx ON test_table(col_a);
CREATE INDEX test_table_b_idx ON test_table(col_b);

-- table used to store statistics
CREATE TABLE cross_stats (
    histogram_a INT[],
    histogram_b INT[],
    cross_histogram INT[]
);

/*
* Collects statistics.
*
* This is a very stupid / slow implementation.
*/
CREATE OR REPLACE FUNCTION collect_stats(p_bins_a INT, p_bins_b INT) RETURNS void AS $$
DECLARE
  v_value INT;
  v_count INT;
  v_histogram_a INT[];
  v_histogram_b INT[];
  v_contingency_table INT[];
  v_row RECORD;
  v_bin_idx INT;
BEGIN

  -- count all rows
  SELECT count(*) INTO v_count FROM test_table;

  /* build histogram s */

  -- lower borders
  SELECT MIN(col_a) INTO v_value FROM test_table;
  v_histogram_a := array_append(v_histogram_a, v_value);

  SELECT MIN(col_b) INTO v_value FROM test_table;
  v_histogram_b := array_append(v_histogram_b, v_value);

  -- inner borders
  FOR v_idx IN 1..(p_bins_a-1) LOOP
      SELECT col_a INTO v_value FROM test_table ORDER BY col_a LIMIT 1 OFFSET floor(v_idx * v_count / p_bins_a);
      v_histogram_a := array_append(v_histogram_a, v_value);
  END LOOP;

  FOR v_idx IN 1..(p_bins_b-1) LOOP
      SELECT col_b INTO v_value FROM test_table ORDER BY col_b LIMIT 1 OFFSET floor(v_idx * v_count / p_bins_b);
      v_histogram_b := array_append(v_histogram_b, v_value);
  END LOOP;

  -- upper borders
  SELECT MAX(col_a) INTO v_value FROM test_table;
  v_histogram_a := array_append(v_histogram_a, v_value);

  SELECT MAX(col_b) INTO v_value FROM test_table;
  v_histogram_b := array_append(v_histogram_b, v_value);

  /* build the contingency table */
  
  -- init
  FOR v_idx_a IN 1..(p_bins_a*p_bins_b) LOOP
    v_contingency_table := array_append(v_contingency_table, 0);
  END LOOP;

  FOR v_row IN (SELECT * FROM test_table) LOOP
    v_bin_idx := get_contingency_bin(v_histogram_a, v_histogram_b, v_row.col_a, v_row.col_b);
    v_contingency_table[v_bin_idx] := v_contingency_table[v_bin_idx] + 1;
  END LOOP;

  -- save stats
  DELETE FROM cross_stats;
  INSERT INTO cross_stats VALUES (v_histogram_a, v_histogram_b, v_contingency_table);

END;
$$ LANGUAGE plpgsql;

-- get ID of the bin in a (linearized) contingency table
CREATE OR REPLACE FUNCTION get_contingency_bin(p_histogram_a INT[], p_histogram_b INT[], p_value_a INT, p_value_b INT) RETURNS INT AS $$
DECLARE
  v_idx_a INT;
  v_idx_b INT;
BEGIN

  v_idx_a := get_histogram_bin(p_histogram_a, p_value_a);
  v_idx_b := get_histogram_bin(p_histogram_b, p_value_b);

  RETURN (v_idx_b - 1) * (array_upper(p_histogram_a,1)-1) + v_idx_a;

END;
$$ LANGUAGE plpgsql;

-- get bin in a histogram
CREATE OR REPLACE FUNCTION get_histogram_bin(p_histogram INT[], p_value INT) RETURNS INT AS $$
DECLARE
  v_tmp INT;
BEGIN

  -- slow, bisection should be used ...
  FOR v_idx IN 1..(array_upper(p_histogram,1)-1) LOOP
    IF (p_value >= p_histogram[v_idx]) THEN
      v_tmp := v_idx;
    END IF;
  END LOOP;

  RETURN v_tmp;

END;
$$ LANGUAGE plpgsql;

-- compute the estimate when there are range conditions on both columns, i.e. something like
-- ... WHERE (col_a BETWEEN 40 AND 75) AND (col_b BETWEEN 75 AND 1293)
CREATE OR REPLACE FUNCTION get_estimate(p_from_a INT, p_to_a INT, p_from_b INT, p_to_b INT) RETURNS INT AS $$
DECLARE

  -- bin indexes for col_a
  v_from_a_bin INT;
  v_to_a_bin INT;

  -- bin indexes for col_b
  v_from_b_bin INT;
  v_to_b_bin INT;

  -- the estimate
  v_estimate INT := 0;

  -- histograms (loaded from cross_stats)
  v_histogram_a INT[];
  v_histogram_b INT[];
  v_contingency INT[];
  v_cont_idx INT;

  -- coefficients (used to compute area of a single bin)
  v_coeff_a FLOAT;
  v_coeff_b FLOAT;

BEGIN

  SELECT histogram_a INTO v_histogram_a FROM cross_stats;
  SELECT histogram_b INTO v_histogram_b FROM cross_stats;
  SELECT cross_histogram INTO v_contingency FROM cross_stats;

  v_from_a_bin := get_histogram_bin(v_histogram_a, p_from_a);
  v_to_a_bin := get_histogram_bin(v_histogram_a, p_to_a);

  v_from_b_bin := get_histogram_bin(v_histogram_b, p_from_b);
  v_to_b_bin := get_histogram_bin(v_histogram_b, p_to_b);

  FOR v_idx_a IN v_from_a_bin..v_to_a_bin LOOP

    IF (v_from_a_bin = v_to_a_bin) THEN
      -- single bin
      v_coeff_a := (p_to_a - p_from_a)::float / (v_histogram_a[v_from_a_bin+1] - v_histogram_a[v_from_a_bin]);
    ELSIF (v_idx_a = v_from_a_bin) THEN
      -- starting bin
      v_coeff_a := (v_histogram_a[v_from_a_bin+1] - p_from_a)::float / (v_histogram_a[v_from_a_bin+1] - v_histogram_a[v_from_a_bin]);
    ELSIF (v_idx_a = v_to_a_bin) THEN
      -- last bin
      v_coeff_a := (p_to_a - v_histogram_a[v_to_a_bin])::float / (v_histogram_a[v_to_a_bin+1] - v_histogram_a[v_to_a_bin]);
    ELSE
      -- inner bins
      v_coeff_a := 1;
    END IF;

    FOR v_idx_b IN v_from_b_bin..v_to_b_bin LOOP

      IF (v_from_b_bin = v_to_b_bin) THEN
	-- single bin
	v_coeff_b := (p_to_b - p_from_b)::float / (v_histogram_b[v_from_b_bin+1] - v_histogram_b[v_from_b_bin]);
      ELSIF (v_idx_b = v_from_b_bin) THEN
	-- starting bin
	v_coeff_b := (v_histogram_b[v_from_b_bin+1] - p_from_b)::float / (v_histogram_b[v_from_b_bin+1] - v_histogram_b[v_from_b_bin]);
      ELSIF (v_idx_a = v_to_a_bin) THEN
	-- last bin
	v_coeff_b := (p_to_b - v_histogram_a[v_to_b_bin])::float / (v_histogram_a[v_to_b_bin+1] - v_histogram_a[v_to_b_bin]);
      ELSE
	-- inner bins
	v_coeff_b := 1;
      END IF;

      v_cont_idx := (v_idx_b - 1) * (array_upper(v_histogram_a,1)-1) + v_idx_a;

      v_estimate := v_estimate + round(v_contingency[v_cont_idx] * v_coeff_a * v_coeff_b);

    END LOOP;
  END LOOP;

  RETURN v_estimate;

END;
$$ LANGUAGE plpgsql;

/*
  independent columns

   col_a   |   col_b   | actual | expected | 10x10 | 20x20 |
  [50,70]  |  [50,70]  |    41  |     40   |   41  |    47 |
  [50,250] |  [50,250] |  4023  |   4024   | 4436  |  3944 |
  [50,250] | [750,950] |  4023  |   3955   | 4509  |  3933 |

*/
INSERT INTO test_table SELECT round(random()*1000), round(random()*1000) FROM generate_series(1,100000);

/*
  positively dependent columns

   col_a   |   col_b   | actual | expected | 10x10 | 20x20 | 40x40 | 100x100 |
  [50,70]  |  [50,70]  |   2143 |     57   |   391 |   729 |  1468 |    1866 |
  [50,250] |  [50,250] |  20181 |   4111   | 15401 | 19983 | 19985 |   19991 |
  [50,250] | [750,950] |      0 |   3977   |     0 |     0 |     0 |       0 |

*/
INSERT INTO test_table SELECT val, val FROM (SELECT round(random()*1000) AS val FROM generate_series(1,100000)) foo;