--- plpgsql.sgml.orig 2004-12-29 15:48:53.089973005 +1300
+++ plpgsql.sgml 2004-12-29 12:43:50.000000000 +1300
@@ -2632,6 +2632,306 @@
;
+
+
+ An area where triggers can be useful is maintaining a summary table
+ of another table. The resulting summary can be used in place of the
+ original table for certain queries, often with vastly reduced run
+ times.
+
+
+
+ This technique is commonly used in Data Warehousing, where the tables
+ of measured or observed data (called fact tables) can be extremely large.
+
+
+
+ A PL/pgSQL Trigger Procedure For Maintaining A Summary Table
+
+
+ shows an example of a
+ trigger procedure in PL/pgSQL that maintains
+ a summary table for a fact table in a data warehouse.
+
+
+
+ The schema detailed here is loosely based on the Grocery Store
+ example from The Data Warehouse Toolkit
+ by Ralph Kimball.
+
+
+
+--
+-- Three dimension tables: time, product and store.
+--
+CREATE TABLE time_dimension (
+ time_key integer NOT NULL,
+ day_of_week integer NOT NULL,
+ day_of_month integer NOT NULL,
+ month integer NOT NULL,
+ quarter integer NOT NULL,
+ year integer NOT NULL
+);
+
+CREATE TABLE product_dimension (
+ product_key integer NOT NULL,
+ description varchar(100) NOT NULL,
+ brand varchar(50) NOT NULL,
+ category varchar(20) NOT NULL
+);
+
+CREATE TABLE store_dimension (
+ store_key integer NOT NULL,
+ store_name varchar(100) NOT NULL,
+ address varchar(100) NOT NULL
+);
+
+
+--
+-- Sales fact table: one row per (time, product, store) observation; no PK declared.
+--
+CREATE TABLE sales_fact (
+ time_key integer NOT NULL,
+ product_key integer NOT NULL,
+ store_key integer NOT NULL,
+ amount_sold numeric(12,2) NOT NULL,
+ units_sold integer NOT NULL,
+ amount_cost numeric(12,2) NOT NULL
+);
+
+
+--
+-- Sales summary, aggregated by time_key; wider numeric types hold the sums.
+--
+CREATE TABLE sales_summary_bytime (
+ time_key integer NOT NULL,
+ amount_sold numeric(15,2) NOT NULL,
+ units_sold numeric(12) NOT NULL,
+ amount_cost numeric(15,2) NOT NULL
+);
+
+--
+-- Bulk load sample data (the file paths are illustrative only).
+--
+COPY time_dimension FROM '/var/dump/time.dat' DELIMITERS ',';
+COPY product_dimension FROM '/var/dump/prod.dat' DELIMITERS ',';
+COPY store_dimension FROM '/var/dump/store.dat' DELIMITERS ',';
+COPY sales_fact FROM '/var/dump/sales.dat' DELIMITERS ',';
+
+
+--
+-- Index the dimension and summary keys (unique) and each fact foreign key.
+--
+CREATE UNIQUE INDEX time_dimension_key ON time_dimension(time_key);
+CREATE INDEX time_dimension_year ON time_dimension(year);
+CREATE UNIQUE INDEX product_dimension_key ON product_dimension(product_key);
+CREATE UNIQUE INDEX store_dimension_key ON store_dimension(store_key);
+
+CREATE INDEX sales_fact_time ON sales_fact(time_key);
+CREATE INDEX sales_fact_product ON sales_fact(product_key);
+CREATE INDEX sales_fact_store ON sales_fact(store_key);
+
+CREATE UNIQUE INDEX sales_summary_bytime_key ON sales_summary_bytime(time_key);
+
+
+--
+-- Refresh planner statistics after the bulk load (VERBOSE just reports progress).
+--
+ANALYZE VERBOSE time_dimension;
+ANALYZE VERBOSE product_dimension;
+ANALYZE VERBOSE store_dimension;
+ANALYZE VERBOSE sales_fact;
+
+
+--
+-- Pre-populate the summary from the existing facts, then gather statistics.
+--
+INSERT INTO sales_summary_bytime (
+ time_key,
+ amount_sold,
+ units_sold,
+ amount_cost)
+ SELECT f.time_key,
+ sum(f.amount_sold),
+ sum(f.units_sold),
+ sum(f.amount_cost)
+ FROM sales_fact f
+ GROUP BY f.time_key;
+
+ANALYZE VERBOSE sales_summary_bytime;
+
+
+--
+-- Trigger function: maintain the summarized column(s) on UPDATE, INSERT, DELETE.
+--
+CREATE OR REPLACE FUNCTION maint_sales_summary_bytime() RETURNS TRIGGER AS $maint_sales_summary_bytime$
+ DECLARE
+ delta_time_key integer;
+ delta_amount_sold numeric(15,2);
+ delta_units_sold numeric(12);
+ delta_amount_cost numeric(15,2);
+ BEGIN
+
+ -- Work out the increment/decrement amount(s).
+ IF (TG_OP = 'DELETE') THEN
+
+ delta_time_key := OLD.time_key;
+ delta_amount_sold := -1 * OLD.amount_sold;
+ delta_units_sold := -1 * OLD.units_sold;
+ delta_amount_cost := -1 * OLD.amount_cost;
+
+ ELSIF (TG_OP = 'UPDATE') THEN
+
+ -- forbid updates that change the time_key -
+ -- (probably not too onerous, as DELETE + INSERT is how most
+ -- changes will be made).
+ IF (OLD.time_key <> NEW.time_key) THEN
+ RAISE EXCEPTION 'Update of time_key : % -> % not allowed', OLD.time_key, NEW.time_key;
+ END IF;
+
+ delta_time_key := OLD.time_key;
+ delta_amount_sold := NEW.amount_sold - OLD.amount_sold;
+ delta_units_sold := NEW.units_sold - OLD.units_sold;
+ delta_amount_cost := NEW.amount_cost - OLD.amount_cost;
+
+ ELSIF (TG_OP = 'INSERT') THEN
+
+ delta_time_key := NEW.time_key;
+ delta_amount_sold := NEW.amount_sold;
+ delta_units_sold := NEW.units_sold;
+ delta_amount_cost := NEW.amount_cost;
+
+ END IF;
+
+
+ -- Update the summary row with the new values.
+ UPDATE sales_summary_bytime
+ SET amount_sold = amount_sold + delta_amount_sold,
+ units_sold = units_sold + delta_units_sold,
+ amount_cost = amount_cost + delta_amount_cost
+ WHERE time_key = delta_time_key;
+
+
+ -- There might have been no row with this time_key (e.g new data!).
+ IF (NOT FOUND) THEN
+ BEGIN
+ INSERT INTO sales_summary_bytime (
+ time_key,
+ amount_sold,
+ units_sold,
+ amount_cost)
+ SELECT f.time_key,
+ sum(f.amount_sold),
+ sum(f.units_sold),
+ sum(f.amount_cost)
+ FROM sales_fact f
+ WHERE f.time_key = delta_time_key
+ GROUP BY f.time_key;
+ -- This query can potentially be very expensive if the trigger
+ -- is created on sales_fact without the time_key indexes.
+ -- Some care is needed to ensure that this situation does
+ -- *not* occur.
+ EXCEPTION
+ --
+ -- Catch race condition when two transactions are adding data
+ -- for a new time_key.
+ --
+ WHEN UNIQUE_VIOLATION THEN
+ UPDATE sales_summary_bytime
+ SET amount_sold = amount_sold + delta_amount_sold,
+ units_sold = units_sold + delta_units_sold,
+ amount_cost = amount_cost + delta_amount_cost
+ WHERE time_key = delta_time_key;
+
+ END;
+ END IF;
+ RETURN NULL;
+
+ END;
+$maint_sales_summary_bytime$ LANGUAGE plpgsql;
+
+
+--
+-- Fire the maintenance function once for each row changed in sales_fact.
+--
+CREATE TRIGGER maint_sales_summary_bytime
+AFTER INSERT OR UPDATE OR DELETE ON sales_fact
+ FOR EACH ROW EXECUTE PROCEDURE maint_sales_summary_bytime()
+;
+
+
+ The effectiveness of the summary technique is easily demonstrated. In
+ the first case shown below, the base fact table is used; in the second,
+ the summary table is substituted. The run times and plans shown are real.
+
+
+dwexample=#
+SELECT
+ d0.quarter,
+ sum(f.amount_sold)
+FROM
+ time_dimension d0,
+ sales_fact f
+WHERE d0.time_key = f.time_key
+AND d0.year = 2004
+GROUP BY
+ d0.quarter
+;
+ quarter | sum
+---------+-------------
+ 2 | 90000000.00
+ 1 | 90000000.00
+ 4 | 90000000.00
+ 3 | 90000000.00
+(4 rows)
+
+Time: 2898.236 ms
+ QUERY PLAN
+-----------------------------------------------------------------------------------------------------
+ HashAggregate (cost=53237.46..53237.46 rows=1 width=14)
+ -> Nested Loop (cost=0.00..51447.46 rows=358001 width=14)
+ -> Index Scan using time_dimension_year on time_dimension d0 (cost=0.00..9.83 rows=358 width=8)
+ Index Cond: ("year" = 2004)
+ -> Index Scan using sales_fact_time on sales_fact f (cost=0.00..96.72 rows=3757 width=14)
+ Index Cond: ("outer".time_key = f.time_key)
+(6 rows)
+
+
+dwexample=#
+SELECT
+ d0.quarter,
+ sum(f.amount_sold)
+FROM
+ time_dimension d0,
+ sales_summary_bytime f
+WHERE d0.time_key = f.time_key
+AND d0.year = 2004
+GROUP BY
+ d0.quarter
+;
+ quarter | sum
+---------+-------------
+ 2 | 90000000.00
+ 1 | 90000000.00
+ 4 | 90000000.00
+ 3 | 90000000.00
+(4 rows)
+
+Time: 28.459 ms
+ QUERY PLAN
+-------------------------------------------------------------------------------------------------------
+ HashAggregate (cost=260.10..260.10 rows=1 width=14)
+ -> Hash Join (cost=10.72..258.31 rows=358 width=14)
+ Hash Cond: ("outer".time_key = "inner".time_key)
+ -> Seq Scan on sales_summary_bytime f (cost=0.00..194.00 rows=10000 width=14)
+ -> Hash (cost=9.83..9.83 rows=358 width=8)
+ -> Index Scan using time_dimension_year on time_dimension d0 (cost=0.00..9.83 rows=358 width=8)
+ Index Cond: ("year" = 2004)
+(7 rows)
+
+
+
+