Re: Incrementally refreshed materialized view

From: Kevin Grittner <kgrittn(at)gmail(dot)com>
To: Rakesh Kumar <rakeshkumar464(at)outlook(dot)com>
Cc: Rob Sargent <robjsargent(at)gmail(dot)com>, "pgsql-general(at)postgresql(dot)org" <pgsql-general(at)postgresql(dot)org>
Subject: Re: Incrementally refreshed materialized view
Date: 2016-09-26 19:21:51
Message-ID: CACjxUsPsdqCCt0tzjFkWq8PaM9dWekwZagGZAZ7v1U6xeaAndQ@mail.gmail.com
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-general

On Mon, Sep 26, 2016 at 2:04 PM, Rakesh Kumar
<rakeshkumar464(at)outlook(dot)com> wrote:

> Does PG have a concept of MV log, from where it can detect the
> delta changes and apply incremental changes quickly.

That is what I am trying to work toward with the patch I cited in
an earlier post. Once some variation of that is in, the actual
incremental maintenance can be build on top of it. To see an
example of what would be done with such a delta relation for a
simple MV, using the count algorithm, see below:

--------------------------------------------------------------------------
--
-- This file demonstrates how incremental maintenance of materialized views
-- is intended to work using the counting algorithm. TEMP tables are used
-- to mock up both snapshots of relations and delta relations which will
-- be created automatically "behind the scenes" as part of the algorithm.
--
-- Delta relations for both base tables and materialized views will need a
-- "count(t)" column to track the number of alternative derivations of the
-- tuple. The column is actually named that in this example. Where the
-- count is needed by the algorithm and a relation doesn't have it
-- (because, for example, it is a base table), 1 is used. This is kept in
-- the example for purposes of illustration. While it is explicitly added
-- to the materialized views for this example, the idea would be that
-- specifying these materialized views using DISTINCT would implicitly add
-- the "count(t)" column when incremental maintenance was specified.
--
-- The logic for a JOIN is that the resulting row should have the product
-- of multiplying the two input row counts. Calculating a delta for that
-- requires two such joins which are then processed by a UNION with special
-- semantics. Those semantics are achieved in this example by feeding
-- UNION ALL results to an outer SELECT which uses GROUP BY and HAVING
-- clauses.
--
-- This example dodges some complex and performance-sapping problems that
-- occur when duplicate rows may be present. It does so with a combination
-- of a PRIMARY KEY declaration in the base table and GROUP BY clauses in
-- the materialized views.
--
-- The temporary relation names in the example are chosen for clarity of
-- the example. If temporary relation names are actually needed in the
-- implementation, they would need to be different, probably based on the
-- related permanent relation OID, for length.
--
--------------------------------------------------------------------------

-- Set up the base table used for these materialized views.
CREATE TABLE link
(
src text not null,
dst text not null,
primary key (src, dst)
);
INSERT INTO link
VALUES ('a','b'),('b','c'),('b','e'),('a','d'),('d','c');

-- Display and capture "before" image.
SELECT * FROM link ORDER BY 1, 2;
CREATE TEMP TABLE link1 AS SELECT * FROM link;

-- Create and display the initial state of the "hop" MV.
CREATE MATERIALIZED VIEW hop AS
SELECT t1.src, t2.dst, count(*) AS "count(t)"
FROM link t1
JOIN link t2 ON (t2.src = t1.dst)
GROUP BY t1.src, t2.dst;
SELECT * FROM hop;

-- Execute a transaction that modifies the base table.
-- The assumption is that the changes will be applied shortly after commit.
BEGIN;
DELETE FROM link WHERE src = 'b' AND dst = 'e';
INSERT INTO link VALUES ('c','h'),('f','g');
COMMIT;

-- Display and capture "after" image.
SELECT * FROM link ORDER BY 1, 2;
CREATE TEMP TABLE link2 AS SELECT * FROM link;

-- Mock up the delta(link) table which will get created "behind the scenes".
CREATE TEMP TABLE "Δ(link)" (src text NOT NULL, dst text NOT NULL,
"count(t)" int NOT NULL);
INSERT INTO "Δ(link)" VALUES ('b','e',-1),('c','h',1),('f','g',1);
SELECT * FROM "Δ(link)" ORDER BY 1, 2;

-- Given link before and after images, and delta(link), calculate delta(hop).
-- This must happen while the "before" and "after" snapshots are still
registered,
-- but not necessarily in the same transaction as the modifications to
the base table.
CREATE TEMP TABLE "Δ(hop)"
(src text NOT NULL, dst text NOT NULL, "count(t)" int NOT NULL);
INSERT INTO "Δ(hop)"
SELECT src, dst, sum("count(t)") AS "count(t)"
FROM (
SELECT delta.src, before.dst, delta."count(t)" * 1
FROM "Δ(link)" delta
JOIN link1 before ON (before.src = delta.dst)
UNION ALL
SELECT after.src, delta.dst, 1 * delta."count(t)"
FROM link2 after
JOIN "Δ(link)" delta ON (delta.src = after.dst)
) x(src, dst, "count(t)")
GROUP BY src, dst
HAVING sum("count(t)") <> 0;
SELECT * FROM "Δ(hop)" ORDER BY 1, 2;

-- Once the MV delta has been generated, the snapshots can be released.
-- We're using temp tables for demonstration purposes, so drop those,
-- and the base table's delta.
DROP TABLE link1, link2, "Δ(link)";

-- At some later time the MV delta is processed "behind the scenes".
-- We can't do the demonstration maintenance against the MV, so copy it.
CREATE TEMP TABLE hop2 AS SELECT * FROM hop;
BEGIN;
DELETE FROM hop2 t1
USING "Δ(hop)" t2
WHERE t1.src = t2.src
AND t1.dst = t2.dst
AND t1."count(t)" + t2."count(t)" = 0;
UPDATE hop2 t1
SET "count(t)" = t1."count(t)" + t2."count(t)"
FROM "Δ(hop)" t2
WHERE t1.src = t2.src
AND t1.dst = t2.dst;
INSERT INTO hop2
SELECT * FROM "Δ(hop)" t1
WHERE "count(t)" > 0
AND NOT EXISTS (SELECT * FROM hop2 t2 WHERE t2.src = t1.src AND
t2.dst = t1.dst);
COMMIT;

-- Show that we got to the same result with incremental maintenance as
with REFRESH.
REFRESH MATERIALIZED VIEW hop;
SELECT * FROM hop ORDER BY 1, 2;
SELECT * FROM hop2 ORDER BY 1, 2;

-- Now we're done with the MV delta.
DROP TABLE hop2, "Δ(hop)";

--------------------------------------------------------------------------

-- Let's show how it works when an MV uses another MV.
-- Create the 2nd-level MV.
CREATE MATERIALIZED VIEW tri_hop AS
SELECT t1.src, t2.dst, sum(t1."count(t)" * 1) AS "count(t)"
FROM hop t1
JOIN link t2 ON (t2.src = t1.dst)
GROUP BY t1.src, t2.dst;
SELECT * FROM tri_hop ORDER BY 1, 2;

-- Display and capture "before" image.
-- Include hop now, because it is referenced by tri_hop.
SELECT * FROM link ORDER BY 1, 2;
CREATE TEMP TABLE link1 AS SELECT * FROM link;
SELECT * FROM hop ORDER BY 1, 2;
CREATE TEMP TABLE hop1 AS SELECT * FROM hop;

-- Execute a transaction that modifies the base table.
-- The assumption is that the changes will be applied shortly after commit.
BEGIN;
DELETE FROM link WHERE src = 'a' AND dst = 'b';
INSERT INTO link VALUES ('d','f'),('b','h');
COMMIT;

-- Display and capture "after" image.
SELECT * FROM link ORDER BY 1, 2;
CREATE TEMP TABLE link2 AS SELECT * FROM link;

-- Mock up the delta(link) table which will get created "behind the scenes".
CREATE TEMP TABLE "Δ(link)" (src text NOT NULL, dst text NOT NULL,
"count(t)" int NOT NULL);
INSERT INTO "Δ(link)" VALUES ('a','b',-1),('d','f',1),('b','h',1);
SELECT * FROM "Δ(link)" ORDER BY 1, 2;

-- Given link before and after images, and delta(link), calculate delta(hop).
-- This must happen while the "before" and "after" snapshots are still
registered,
-- but not necessarily in the same transaction as the modifications to
the base table.
-- For a "first level" MV, this is calculated just the same as before,
but it will be used
-- to calculate the 2nd level MV before we discard the snapshots.
CREATE TEMP TABLE "Δ(hop)"
(src text NOT NULL, dst text NOT NULL, "count(t)" int NOT NULL);
INSERT INTO "Δ(hop)"
SELECT src, dst, sum("count(t)") AS "count(t)"
FROM (
SELECT delta.src, before.dst, delta."count(t)" * 1
FROM "Δ(link)" delta
JOIN link1 before ON (before.src = delta.dst)
UNION ALL
SELECT after.src, delta.dst, 1 * delta."count(t)"
FROM link2 after
JOIN "Δ(link)" delta ON (delta.src = after.dst)
) x(src, dst, "count(t)")
GROUP BY src, dst
HAVING sum("count(t)") <> 0;
SELECT * FROM "Δ(hop)" ORDER BY 1, 2;

-- Since the counting algorithm requires the link table "before"
image, as well as
-- the hop MV "after" image and delta(hop) to calculate delta(tri_hop), we must
-- maintain hop before releasing the snapshots used to update link.
CREATE TEMP TABLE hop2 AS SELECT * FROM hop;
BEGIN;
DELETE FROM hop2 t1
USING "Δ(hop)" t2
WHERE t1.src = t2.src
AND t1.dst = t2.dst
AND t1."count(t)" + t2."count(t)" = 0;
UPDATE hop2 t1
SET "count(t)" = t1."count(t)" + t2."count(t)"
FROM "Δ(hop)" t2
WHERE t1.src = t2.src
AND t1.dst = t2.dst;
INSERT INTO hop2
SELECT * FROM "Δ(hop)" t1
WHERE "count(t)" > 0
AND NOT EXISTS (SELECT * FROM hop2 t2 WHERE t2.src = t1.src AND
t2.dst = t1.dst);
COMMIT;

-- Show that we got to the same result with incremental maintenance as
with REFRESH.
REFRESH MATERIALIZED VIEW hop;
SELECT * FROM hop ORDER BY 1, 2;
SELECT * FROM hop2 ORDER BY 1, 2;

-- After the 1st level MV is run, we can calculate the delta for the 2nd level.
CREATE TEMP TABLE "Δ(tri_hop)"
(src text NOT NULL, dst text NOT NULL, "count(t)" int NOT NULL);
INSERT INTO "Δ(tri_hop)"
SELECT src, dst, sum("count(t)") AS "count(t)"
FROM (
SELECT delta.src, before.dst, delta."count(t)" * 1
FROM "Δ(hop)" delta
JOIN link1 before ON (before.src = delta.dst)
UNION ALL
SELECT after.src, delta.dst, 1 * delta."count(t)"
FROM hop2 after
JOIN "Δ(link)" delta ON (delta.src = after.dst)
) x(src, dst, "count(t)")
GROUP BY src, dst
HAVING sum("count(t)") <> 0;
SELECT * FROM "Δ(tri_hop)" ORDER BY 1, 2;

-- Now we're done with snapshots and all but the highest-level delta.
DROP TABLE link1, link2, "Δ(link)";
DROP TABLE hop1, hop2, "Δ(hop)";

-- At some later time the MV delta is processed "behind the scenes".
-- We can't do the demonstration maintenance against the MV, so copy it.
CREATE TEMP TABLE tri_hop2 AS SELECT * FROM tri_hop;
BEGIN;
DELETE FROM tri_hop2 t1
USING "Δ(tri_hop)" t2
WHERE t1.src = t2.src
AND t1.dst = t2.dst
AND t1."count(t)" + t2."count(t)" = 0;
UPDATE tri_hop2 t1
SET "count(t)" = t1."count(t)" + t2."count(t)"
FROM "Δ(tri_hop)" t2
WHERE t1.src = t2.src
AND t1.dst = t2.dst;
INSERT INTO tri_hop2
SELECT * FROM "Δ(tri_hop)" t1
WHERE "count(t)" > 0
AND NOT EXISTS (SELECT * FROM tri_hop2 t2 WHERE t2.src = t1.src
AND t2.dst = t1.dst);
COMMIT;

-- Show that we got to the same result with incremental maintenance as
with REFRESH.
REFRESH MATERIALIZED VIEW tri_hop;
SELECT * FROM tri_hop ORDER BY 1, 2;
SELECT * FROM tri_hop2 ORDER BY 1, 2;

-- Now we're done with the MV delta.
DROP TABLE tri_hop2, "Δ(tri_hop)";

--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

In response to

Responses

Browse pgsql-general by date

  From Date Subject
Next Message Tom Lane 2016-09-26 19:38:03 Re: [GENERAL] inconsistent behaviour of set-returning functions in sub-query with random()
Previous Message Alexander Farber 2016-09-26 19:15:35 Re: Custom SQL function does not like IF-statement