patternsqlMajor
Combining separate ranges into largest possible contiguous ranges
Viewed 0 times
combiningintoseparaterangeslargestpossiblecontiguous
Problem
I'm trying to combine multiple date ranges (my load is about max 500, most cases 10) that may or may not overlap into the largest possible contiguous date ranges. For example:
Data:
Table looks like:
Desired results:
Visual representation:
Data:
CREATE TABLE test (
id SERIAL PRIMARY KEY NOT NULL,
range DATERANGE
);
INSERT INTO test (range) VALUES
(DATERANGE('2015-01-01', '2015-01-05')),
(DATERANGE('2015-01-01', '2015-01-03')),
(DATERANGE('2015-01-03', '2015-01-06')),
(DATERANGE('2015-01-07', '2015-01-09')),
(DATERANGE('2015-01-08', '2015-01-09')),
(DATERANGE('2015-01-12', NULL)),
(DATERANGE('2015-01-10', '2015-01-12')),
(DATERANGE('2015-01-10', '2015-01-12'));Table looks like:
id | range
----+-------------------------
1 | [2015-01-01,2015-01-05)
2 | [2015-01-01,2015-01-03)
3 | [2015-01-03,2015-01-06)
4 | [2015-01-07,2015-01-09)
5 | [2015-01-08,2015-01-09)
6 | [2015-01-12,)
7 | [2015-01-10,2015-01-12)
8 | [2015-01-10,2015-01-12)
(8 rows)Desired results:
combined
--------------------------
[2015-01-01, 2015-01-06)
[2015-01-07, 2015-01-09)
[2015-01-10, )Visual representation:
1 | =====
2 | ===
3 | ===
4 | ==
5 | =
6 | =============>
7 | ==
8 | ==
--+---------------------------
| ====== == ===============>Solution
Assumptions / Clarifications
The manual:
The built-in range types
canonical form that includes the lower bound and excludes the upper
bound; that is,
For other types (like
Solution with pure SQL
With CTEs for clarity:
Or, the same with subqueries, faster but less easy too read:
How?
Replace NULL bounds (unbounded) with +/-
Remember, the upper bound is always excluded.
In the outer
Or with one less subquery level, but flipping sort order:
Sort the window in the second step with
Related answer with more explanation:
Procedural solution with plpgsql
Works for any table / column name, but only for type
Procedural solutions with loops are typically slower, but in this special case I expect the function to be substantially faster since it only needs a single sequential scan:
Call:
The logic is similar to the SQL solutions, but we can make do with a single pass.
SQL Fiddle.
Related:
The usual drill for handling user input in dynamic SQL:
Index
For each of these solutions a plain (default) btree index on
- No need to differentiate between
infinityand open upper bound (upper(range) IS NULL). (You can have it either way, but it's simpler this way.)
- NULL vs.
infinityin PostgreSQL range types
- Since
dateis a discrete type, all ranges have default[)bounds.
The manual:
The built-in range types
int4range, int8range, and daterange all use acanonical form that includes the lower bound and excludes the upper
bound; that is,
[).For other types (like
tsrange!) I would enforce the same if possible:- Preventing adjacent/overlapping entries with EXCLUDE in PostgreSQL
Solution with pure SQL
With CTEs for clarity:
WITH a AS (
SELECT range
, COALESCE(lower(range),'-infinity') AS startdate
, max(COALESCE(upper(range), 'infinity')) OVER (ORDER BY range) AS enddate
FROM test
)
, b AS (
SELECT *, lag(enddate) OVER (ORDER BY range) < startdate OR NULL AS step
FROM a
)
, c AS (
SELECT *, count(step) OVER (ORDER BY range) AS grp
FROM b
)
SELECT daterange(min(startdate), max(enddate)) AS range
FROM c
GROUP BY grp
ORDER BY 1;Or, the same with subqueries, faster but less easy too read:
SELECT daterange(min(startdate), max(enddate)) AS range
FROM (
SELECT *, count(step) OVER (ORDER BY range) AS grp
FROM (
SELECT *, lag(enddate) OVER (ORDER BY range) < startdate OR NULL AS step
FROM (
SELECT range
, COALESCE(lower(range),'-infinity') AS startdate
, max(COALESCE(upper(range), 'infinity')) OVER (ORDER BY range) AS enddate
FROM test
) a
) b
) c
GROUP BY grp
ORDER BY 1;How?
a: While ordering by range, compute the running maximum of the upper bound (enddate) with a window function.Replace NULL bounds (unbounded) with +/-
infinity just to simplify (no special NULL cases).b: In the same sort order, if the previous enddate is earlier than startdate we have a gap and start a new range (step).Remember, the upper bound is always excluded.
c: Form groups (grp) by counting steps with another window function.In the outer
SELECT build ranges from lower to upper bound in each group. Voilá.Or with one less subquery level, but flipping sort order:
SELECT daterange(min(COALESCE(lower(range), '-infinity')), max(enddate)) AS range
FROM (
SELECT *, count(nextstart > enddate OR NULL) OVER (ORDER BY range DESC NULLS LAST) AS grp
FROM (
SELECT range
, max(COALESCE(upper(range), 'infinity')) OVER (ORDER BY range) AS enddate
, lead(lower(range)) OVER (ORDER BY range) As nextstart
FROM test
) a
) b
GROUP BY grp
ORDER BY 1;Sort the window in the second step with
ORDER BY range DESC NULLS LAST (with NULLS LAST) to get perfectly inverted sort order. This should be cheaper (easier to produce, matches sort order of suggested index perfectly) and accurate for corner cases with rank IS NULL. See:- PostgreSQL sort by datetime asc, null first?
Related answer with more explanation:
- Compare multiple date ranges
Procedural solution with plpgsql
Works for any table / column name, but only for type
daterange.Procedural solutions with loops are typically slower, but in this special case I expect the function to be substantially faster since it only needs a single sequential scan:
CREATE OR REPLACE FUNCTION f_range_agg(_tbl text, _col text)
RETURNS SETOF daterange AS
$func$
DECLARE
_lower date;
_upper date;
_enddate date;
_startdate date;
BEGIN
FOR _lower, _upper IN EXECUTE
format(
$sql$
SELECT COALESCE(lower(t.%2$I),'-infinity') -- replace NULL with ...
, COALESCE(upper(t.%2$I), 'infinity') -- ... +/- infinity
FROM %1$I t
ORDER BY t.%2$I
$sql$, _tbl, _col)
LOOP
IF _lower > _enddate THEN -- return previous range
RETURN NEXT daterange(_startdate, _enddate);
SELECT _lower, _upper INTO _startdate, _enddate;
ELSIF _upper > _enddate THEN -- expand range
_enddate := _upper;
-- do nothing if _upper <= _enddate (range already included) ...
ELSIF _enddate IS NULL THEN -- init 1st round
SELECT _lower, _upper INTO _startdate, _enddate;
END IF;
END LOOP;
IF FOUND THEN -- return last row
RETURN NEXT daterange(_startdate, _enddate);
END IF;
END
$func$ LANGUAGE plpgsql;Call:
SELECT * FROM f_range_agg('test', 'range'); -- table and column nameThe logic is similar to the SQL solutions, but we can make do with a single pass.
SQL Fiddle.
Related:
- GROUP BY and aggregate sequential numeric values
The usual drill for handling user input in dynamic SQL:
- SQL injection in Postgres functions vs prepared queries
Index
For each of these solutions a plain (default) btree index on
range would be instrumental Code Snippets
WITH a AS (
SELECT range
, COALESCE(lower(range),'-infinity') AS startdate
, max(COALESCE(upper(range), 'infinity')) OVER (ORDER BY range) AS enddate
FROM test
)
, b AS (
SELECT *, lag(enddate) OVER (ORDER BY range) < startdate OR NULL AS step
FROM a
)
, c AS (
SELECT *, count(step) OVER (ORDER BY range) AS grp
FROM b
)
SELECT daterange(min(startdate), max(enddate)) AS range
FROM c
GROUP BY grp
ORDER BY 1;SELECT daterange(min(startdate), max(enddate)) AS range
FROM (
SELECT *, count(step) OVER (ORDER BY range) AS grp
FROM (
SELECT *, lag(enddate) OVER (ORDER BY range) < startdate OR NULL AS step
FROM (
SELECT range
, COALESCE(lower(range),'-infinity') AS startdate
, max(COALESCE(upper(range), 'infinity')) OVER (ORDER BY range) AS enddate
FROM test
) a
) b
) c
GROUP BY grp
ORDER BY 1;SELECT daterange(min(COALESCE(lower(range), '-infinity')), max(enddate)) AS range
FROM (
SELECT *, count(nextstart > enddate OR NULL) OVER (ORDER BY range DESC NULLS LAST) AS grp
FROM (
SELECT range
, max(COALESCE(upper(range), 'infinity')) OVER (ORDER BY range) AS enddate
, lead(lower(range)) OVER (ORDER BY range) As nextstart
FROM test
) a
) b
GROUP BY grp
ORDER BY 1;CREATE OR REPLACE FUNCTION f_range_agg(_tbl text, _col text)
RETURNS SETOF daterange AS
$func$
DECLARE
_lower date;
_upper date;
_enddate date;
_startdate date;
BEGIN
FOR _lower, _upper IN EXECUTE
format(
$sql$
SELECT COALESCE(lower(t.%2$I),'-infinity') -- replace NULL with ...
, COALESCE(upper(t.%2$I), 'infinity') -- ... +/- infinity
FROM %1$I t
ORDER BY t.%2$I
$sql$, _tbl, _col)
LOOP
IF _lower > _enddate THEN -- return previous range
RETURN NEXT daterange(_startdate, _enddate);
SELECT _lower, _upper INTO _startdate, _enddate;
ELSIF _upper > _enddate THEN -- expand range
_enddate := _upper;
-- do nothing if _upper <= _enddate (range already included) ...
ELSIF _enddate IS NULL THEN -- init 1st round
SELECT _lower, _upper INTO _startdate, _enddate;
END IF;
END LOOP;
IF FOUND THEN -- return last row
RETURN NEXT daterange(_startdate, _enddate);
END IF;
END
$func$ LANGUAGE plpgsql;SELECT * FROM f_range_agg('test', 'range'); -- table and column nameContext
StackExchange Database Administrators Q#100965, answer score: 38
Revisions (0)
No revisions yet.