--- plpgsql.sgml.orig	2004-12-29 15:48:53.089973005 +1300
+++ plpgsql.sgml	2004-12-29 12:43:50.000000000 +1300
@@ -2632,6 +2632,306 @@
 ;
+
+  An area where triggers can be useful is maintaining a summary table
+  of another table.  The resulting summary can be used in place of the
+  original table for certain queries, often with vastly reduced run
+  times.
+
+  This technique is commonly used in Data Warehousing, where the tables
+  of measured or observed data (called fact tables) can be extremely large.
+
+  A <application>PL/pgSQL</application> Trigger Procedure For Maintaining A Summary Table
+
+  This example shows a trigger procedure in PL/pgSQL that maintains
+  a summary table for a fact table in a data warehouse.
+
+  The schema detailed here is loosely based on the Grocery Store
+  example from The Data Warehouse Toolkit by Ralph Kimball.
+
+--
+-- Three dimension tables.
+--
+CREATE TABLE time_dimension (
+    time_key integer NOT NULL,
+    day_of_week integer NOT NULL,
+    day_of_month integer NOT NULL,
+    month integer NOT NULL,
+    quarter integer NOT NULL,
+    year integer NOT NULL
+);
+
+CREATE TABLE product_dimension (
+    product_key integer NOT NULL,
+    description varchar(100) NOT NULL,
+    brand varchar(50) NOT NULL,
+    category varchar(20) NOT NULL
+);
+
+CREATE TABLE store_dimension (
+    store_key integer NOT NULL,
+    store_name varchar(100) NOT NULL,
+    address varchar(100) NOT NULL
+);
+
+--
+-- Sales fact.
+--
+CREATE TABLE sales_fact (
+    time_key integer NOT NULL,
+    product_key integer NOT NULL,
+    store_key integer NOT NULL,
+    amount_sold numeric(12,2) NOT NULL,
+    units_sold integer NOT NULL,
+    amount_cost numeric(12,2) NOT NULL
+);
+
+--
+-- Sales summary.
+--
+CREATE TABLE sales_summary_bytime (
+    time_key integer NOT NULL,
+    amount_sold numeric(15,2) NOT NULL,
+    units_sold numeric(12) NOT NULL,
+    amount_cost numeric(15,2) NOT NULL
+);
+
+--
+-- COPY in data.
+--
+COPY time_dimension FROM '/var/dump/time.dat' DELIMITERS ',';
+COPY product_dimension FROM '/var/dump/prod.dat' DELIMITERS ',';
+COPY store_dimension FROM '/var/dump/store.dat' DELIMITERS ',';
+COPY sales_fact FROM '/var/dump/sales.dat' DELIMITERS ',';
+
+--
+-- Create indexes on the dimensions, facts and summary.
+--
+CREATE UNIQUE INDEX time_dimension_key ON time_dimension(time_key);
+CREATE INDEX time_dimension_year ON time_dimension(year);
+CREATE UNIQUE INDEX product_dimension_key ON product_dimension(product_key);
+CREATE UNIQUE INDEX store_dimension_key ON store_dimension(store_key);
+
+CREATE INDEX sales_fact_time ON sales_fact(time_key);
+CREATE INDEX sales_fact_product ON sales_fact(product_key);
+CREATE INDEX sales_fact_store ON sales_fact(store_key);
+
+CREATE UNIQUE INDEX sales_summary_bytime_key ON sales_summary_bytime(time_key);
+
+--
+-- Collect statistics for the optimizer.
+--
+ANALYZE VERBOSE time_dimension;
+ANALYZE VERBOSE product_dimension;
+ANALYZE VERBOSE store_dimension;
+ANALYZE VERBOSE sales_fact;
+
+--
+-- Pre-populate (and collect statistics for) the summary table.
+--
+INSERT INTO sales_summary_bytime (
+    time_key,
+    amount_sold,
+    units_sold,
+    amount_cost)
+    SELECT f.time_key,
+           sum(f.amount_sold),
+           sum(f.units_sold),
+           sum(f.amount_cost)
+    FROM sales_fact f
+    GROUP BY f.time_key;
+
+ANALYZE VERBOSE sales_summary_bytime;
+
+--
+-- Function to amend summarized column(s) on UPDATE, INSERT, DELETE.
+--
+CREATE OR REPLACE FUNCTION maint_sales_summary_bytime() RETURNS TRIGGER AS $maint_sales_summary_bytime$
+    DECLARE
+        delta_time_key      integer;
+        delta_amount_sold   numeric(15,2);
+        delta_units_sold    numeric(12);
+        delta_amount_cost   numeric(15,2);
+    BEGIN
+
+        -- Work out the increment/decrement amount(s).
+        IF (TG_OP = 'DELETE') THEN
+
+            delta_time_key = OLD.time_key;
+            delta_amount_sold = -1 * OLD.amount_sold;
+            delta_units_sold = -1 * OLD.units_sold;
+            delta_amount_cost = -1 * OLD.amount_cost;
+
+        ELSIF (TG_OP = 'UPDATE') THEN
+
+            -- Forbid updates that change the time_key
+            -- (probably not too onerous, as DELETE + INSERT is how most
+            -- changes will be made).
+            IF (OLD.time_key != NEW.time_key) THEN
+                RAISE EXCEPTION 'Update of time_key: % -> % not allowed', OLD.time_key, NEW.time_key;
+            END IF;
+
+            delta_time_key = OLD.time_key;
+            delta_amount_sold = NEW.amount_sold - OLD.amount_sold;
+            delta_units_sold = NEW.units_sold - OLD.units_sold;
+            delta_amount_cost = NEW.amount_cost - OLD.amount_cost;
+
+        ELSIF (TG_OP = 'INSERT') THEN
+
+            delta_time_key = NEW.time_key;
+            delta_amount_sold = NEW.amount_sold;
+            delta_units_sold = NEW.units_sold;
+            delta_amount_cost = NEW.amount_cost;
+
+        END IF;
+
+        -- Update the summary row with the new values.
+        UPDATE sales_summary_bytime
+            SET amount_sold = amount_sold + delta_amount_sold,
+                units_sold = units_sold + delta_units_sold,
+                amount_cost = amount_cost + delta_amount_cost
+            WHERE time_key = delta_time_key;
+
+        -- There might have been no row with this time_key (e.g. new data!).
+        IF (NOT FOUND) THEN
+            BEGIN
+                INSERT INTO sales_summary_bytime (
+                    time_key,
+                    amount_sold,
+                    units_sold,
+                    amount_cost)
+                    SELECT f.time_key,
+                           sum(f.amount_sold),
+                           sum(f.units_sold),
+                           sum(f.amount_cost)
+                    FROM sales_fact f
+                    WHERE f.time_key = delta_time_key
+                    GROUP BY f.time_key;
+                -- This query can potentially be very expensive if the trigger
+                -- is created on sales_fact without the time_key indexes.
+                -- Some care is needed to ensure that this situation does
+                -- *not* occur.
+            EXCEPTION
+                --
+                -- Catch the race condition when two transactions are adding
+                -- data for a new time_key.
+                --
+                WHEN UNIQUE_VIOLATION THEN
+                    UPDATE sales_summary_bytime
+                        SET amount_sold = amount_sold + delta_amount_sold,
+                            units_sold = units_sold + delta_units_sold,
+                            amount_cost = amount_cost + delta_amount_cost
+                        WHERE time_key = delta_time_key;
+
+            END;
+        END IF;
+        RETURN NULL;
+
+    END;
+$maint_sales_summary_bytime$ LANGUAGE plpgsql;
+
+--
+-- The trigger.
+--
+CREATE TRIGGER maint_sales_summary_bytime
+AFTER INSERT OR UPDATE OR DELETE ON sales_fact
+    FOR EACH ROW EXECUTE PROCEDURE maint_sales_summary_bytime()
+;
+
+  The effectiveness of the summary technique can be easily demonstrated.  In
+  the first case shown below, the base fact table is used; in the second, the
+  summary table is substituted.  The run times and plans shown are real.
+
+dwexample=#
+SELECT
+    d0.quarter,
+    sum(f.amount_sold)
+FROM
+    time_dimension d0,
+    sales_fact f
+WHERE d0.time_key = f.time_key
+AND d0.year = 2004
+GROUP BY
+    d0.quarter
+;
+ quarter |     sum
+---------+-------------
+       2 | 90000000.00
+       1 | 90000000.00
+       4 | 90000000.00
+       3 | 90000000.00
+(4 rows)
+
+Time: 2898.236 ms
+
+                                              QUERY PLAN
+-----------------------------------------------------------------------------------------------------
+ HashAggregate  (cost=53237.46..53237.46 rows=1 width=14)
+   ->  Nested Loop  (cost=0.00..51447.46 rows=358001 width=14)
+         ->  Index Scan using time_year on time_dimension d0  (cost=0.00..9.83 rows=358 width=8)
+               Index Cond: ("year" = 2004)
+         ->  Index Scan using sales_fact_time on sales_fact f  (cost=0.00..96.72 rows=3757 width=14)
+               Index Cond: ("outer".time_key = f.time_key)
+(6 rows)
+
+
+dwexample=#
+SELECT
+    d0.quarter,
+    sum(f.amount_sold)
+FROM
+    time_dimension d0,
+    sales_summary_bytime f
+WHERE d0.time_key = f.time_key
+AND d0.year = 2004
+GROUP BY
+    d0.quarter
+;
+ quarter |     sum
+---------+-------------
+       2 | 90000000.00
+       1 | 90000000.00
+       4 | 90000000.00
+       3 | 90000000.00
+(4 rows)
+
+Time: 28.459 ms
+
+                                               QUERY PLAN
+-------------------------------------------------------------------------------------------------------
+ HashAggregate  (cost=260.10..260.10 rows=1 width=14)
+   ->  Hash Join  (cost=10.72..258.31 rows=358 width=14)
+         Hash Cond: ("outer".time_key = "inner".time_key)
+         ->  Seq Scan on sales_summary_bytime f  (cost=0.00..194.00 rows=10000 width=14)
+         ->  Hash  (cost=9.83..9.83 rows=358 width=8)
+               ->  Index Scan using time_year on time_dimension d0  (cost=0.00..9.83 rows=358 width=8)
+                     Index Cond: ("year" = 2004)
+(7 rows)
+
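
For anyone loading the example into a scratch database, a quick way to see the trigger at work is to change a single fact row inside a transaction and watch the matching summary row move by the same amounts. The snippet below is only a sketch against the schema above: the time_key, product_key and store_key values (10, 1, 1) and the dollar amounts are made up for illustration, and the final ROLLBACK leaves the loaded data untouched.

BEGIN;

-- Note the current totals for one time_key (the value 10 is arbitrary;
-- pick any key present in your time.dat load).
SELECT * FROM sales_summary_bytime WHERE time_key = 10;

-- Insert a fact row for that key.  The AFTER trigger adds 19.99, 2 and
-- 11.50 to the summary row, or creates the row if the key is new.
INSERT INTO sales_fact (time_key, product_key, store_key,
                        amount_sold, units_sold, amount_cost)
VALUES (10, 1, 1, 19.99, 2, 11.50);

-- The summary row should now be higher by exactly those amounts.
SELECT * FROM sales_summary_bytime WHERE time_key = 10;

-- Discard the test row; the summary change is rolled back as well,
-- since the trigger ran inside the same transaction.
ROLLBACK;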
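After a batch of ad hoc changes it can also be reassuring to confirm that the summary still agrees with a fresh aggregation of the fact table. One possible cross-check, sketched using only the tables defined above, is to compare the stored totals with recomputed ones; it should return no rows when the two are in sync. Note that the trigger never deletes summary rows, so a time_key whose fact rows have all been removed will be reported here: its summary row remains (with zero totals) while the recomputed aggregate has no row for it.

-- Compare the stored summary with totals recomputed from sales_fact.
-- Any row returned points at a time_key whose summary disagrees with
-- the facts, or that exists on only one side of the comparison.
SELECT coalesce(s.time_key, f.time_key) AS time_key,
       s.amount_sold AS summary_amount_sold,
       f.amount_sold AS fact_amount_sold
FROM sales_summary_bytime s
FULL OUTER JOIN (
    SELECT time_key,
           sum(amount_sold) AS amount_sold,
           sum(units_sold)  AS units_sold,
           sum(amount_cost) AS amount_cost
    FROM sales_fact
    GROUP BY time_key
) f ON s.time_key = f.time_key
WHERE s.time_key IS NULL
   OR f.time_key IS NULL
   OR s.amount_sold <> f.amount_sold
   OR s.units_sold  <> f.units_sold
   OR s.amount_cost <> f.amount_cost;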