patternsqlMinor
Merge multiple rows into fewest number of rows of distinct values
Viewed 0 times
rowsdistinctnumbermergeintofewestmultiplevalues
Problem
In SQL Server does anyone know of a nice way to merge/flatten multiple rows of data into the fewest possible rows containing only the distinct non null values.
I.e.
A dataset like this:
As this:
If it helps, the before dataset is a pivoted line listing but without the aggregate. I can't aggregate it during the pivot as I want to keep each of the distinct values and not take the MAX or MIN.
The only way I can think of doing it involves splitting the data up and joining it all back together which wont be very efficient.
I.e.
A dataset like this:
As this:
If it helps, the before dataset is a pivoted line listing but without the aggregate. I can't aggregate it during the pivot as I want to keep each of the distinct values and not take the MAX or MIN.
The only way I can think of doing it involves splitting the data up and joining it all back together which wont be very efficient.
Solution
Here are a couple ways you might approach the task by assigning an arbitrary row number to each of the non-NULL values for each column (avoiding extraneous sorts since the problem statement does not require ordered data) and then pivoting the resulting data into your final result set.
Create test data
Perform the desired merge and flatten with one scan of the data per column and only a single sort (or hash group) on the final set of rows
In SQL Server 2017+, perform the desired merge and flatten with a single scan over the data and no sorting
Visualize the test data
Visualize the results
Create test data
CREATE TABLE #table (
ID INT NOT NULL,
Col1 VARCHAR(100) NULL,
Col2 VARCHAR(100) NULL,
Col3 VARCHAR(100) NULL,
Col4 VARCHAR(100) NULL,
Col5 VARCHAR(100) NULL
)
GO
INSERT INTO #table (ID, Col1, Col2, Col3, Col4, Col5)
VALUES (37850, '1A', NULL, '3A', NULL, '5A'),
(37850, NULL, NULL, '3B', NULL, NULL),
(37850, NULL, '2A', '3C', '4A', '5B'),
(37850, NULL, NULL, NULL, NULL, NULL),
(37850, NULL, NULL, NULL, NULL, '5C'),
(37850, NULL, '2B', NULL, NULL, NULL),
(37850, NULL, NULL, NULL, NULL, '5D'),
(37850, NULL, NULL, NULL, NULL, NULL),
(37850, NULL, NULL, NULL, '4B', '5E'),
(37850, NULL, NULL, NULL, '4C', NULL),
(37850, NULL, NULL, NULL, '4D', NULL)
GO
CREATE CLUSTERED INDEX CI ON #table (ID)
GOPerform the desired merge and flatten with one scan of the data per column and only a single sort (or hash group) on the final set of rows
SELECT ID,
-- Pivot the data for each ID / row number pair,
MIN(CASE WHEN colNumber = 1 THEN val END) AS Col1,
MIN(CASE WHEN colNumber = 2 THEN val END) AS Col2,
MIN(CASE WHEN colNumber = 3 THEN val END) AS Col3,
MIN(CASE WHEN colNumber = 4 THEN val END) AS Col4,
MIN(CASE WHEN colNumber = 5 THEN val END) AS Col5
FROM (
-- Within each ID, assign an arbitrary row number to each non-NULL column value
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 1 AS colNumber, Col1 AS val FROM #table WHERE Col1 IS NOT NULL UNION ALL
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 2 AS colNumber, Col2 AS val FROM #table WHERE Col2 IS NOT NULL UNION ALL
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 3 AS colNumber, Col3 AS val FROM #table WHERE Col3 IS NOT NULL UNION ALL
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 4 AS colNumber, Col4 AS val FROM #table WHERE Col4 IS NOT NULL UNION ALL
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 5 AS colNumber, Col5 AS val FROM #table WHERE Col5 IS NOT NULL
) x
GROUP BY ID, rowNum
GOIn SQL Server 2017+, perform the desired merge and flatten with a single scan over the data and no sorting
SELECT s.ID,
-- Pivot the data for each ID / row number pair,
MIN(CASE WHEN v.colNum = 1 THEN v.value END) AS Col1,
MIN(CASE WHEN v.colNum = 2 THEN v.value END) AS Col2,
MIN(CASE WHEN v.colNum = 3 THEN v.value END) AS Col3,
MIN(CASE WHEN v.colNum = 4 THEN v.value END) AS Col4,
MIN(CASE WHEN v.colNum = 5 THEN v.value END) AS Col5
FROM (
-- For each ID, build the list of non-NULL values, using
-- a delimiter that will not exist in your data
SELECT t.ID,
STRING_AGG(t.Col1,CHAR(0)) as stringAgg1,
STRING_AGG(t.Col2,CHAR(0)) as stringAgg2,
STRING_AGG(t.Col3,CHAR(0)) as stringAgg3,
STRING_AGG(t.Col4,CHAR(0)) as stringAgg4,
STRING_AGG(t.Col5,CHAR(0)) as stringAgg5
FROM #table t
GROUP BY t.ID
) s
OUTER APPLY (
-- For each ID, unpivot the list of non-NULL values,
-- appending an arbitrary row number to each value
SELECT value, 1 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg1,CHAR(0)) UNION ALL
SELECT value, 2 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg2,CHAR(0)) UNION ALL
SELECT value, 3 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg3,CHAR(0)) UNION ALL
SELECT value, 4 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg4,CHAR(0)) UNION ALL
SELECT value, 5 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg5,CHAR(0))
) v
-- For each ID, group together all column values with same row number
GROUP BY s.ID, v.rn
GOVisualize the test data
Visualize the results
Code Snippets
CREATE TABLE #table (
ID INT NOT NULL,
Col1 VARCHAR(100) NULL,
Col2 VARCHAR(100) NULL,
Col3 VARCHAR(100) NULL,
Col4 VARCHAR(100) NULL,
Col5 VARCHAR(100) NULL
)
GO
INSERT INTO #table (ID, Col1, Col2, Col3, Col4, Col5)
VALUES (37850, '1A', NULL, '3A', NULL, '5A'),
(37850, NULL, NULL, '3B', NULL, NULL),
(37850, NULL, '2A', '3C', '4A', '5B'),
(37850, NULL, NULL, NULL, NULL, NULL),
(37850, NULL, NULL, NULL, NULL, '5C'),
(37850, NULL, '2B', NULL, NULL, NULL),
(37850, NULL, NULL, NULL, NULL, '5D'),
(37850, NULL, NULL, NULL, NULL, NULL),
(37850, NULL, NULL, NULL, '4B', '5E'),
(37850, NULL, NULL, NULL, '4C', NULL),
(37850, NULL, NULL, NULL, '4D', NULL)
GO
CREATE CLUSTERED INDEX CI ON #table (ID)
GOSELECT ID,
-- Pivot the data for each ID / row number pair,
MIN(CASE WHEN colNumber = 1 THEN val END) AS Col1,
MIN(CASE WHEN colNumber = 2 THEN val END) AS Col2,
MIN(CASE WHEN colNumber = 3 THEN val END) AS Col3,
MIN(CASE WHEN colNumber = 4 THEN val END) AS Col4,
MIN(CASE WHEN colNumber = 5 THEN val END) AS Col5
FROM (
-- Within each ID, assign an arbitrary row number to each non-NULL column value
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 1 AS colNumber, Col1 AS val FROM #table WHERE Col1 IS NOT NULL UNION ALL
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 2 AS colNumber, Col2 AS val FROM #table WHERE Col2 IS NOT NULL UNION ALL
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 3 AS colNumber, Col3 AS val FROM #table WHERE Col3 IS NOT NULL UNION ALL
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 4 AS colNumber, Col4 AS val FROM #table WHERE Col4 IS NOT NULL UNION ALL
SELECT ID, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY (SELECT 0) ASC) AS rowNum, 5 AS colNumber, Col5 AS val FROM #table WHERE Col5 IS NOT NULL
) x
GROUP BY ID, rowNum
GOSELECT s.ID,
-- Pivot the data for each ID / row number pair,
MIN(CASE WHEN v.colNum = 1 THEN v.value END) AS Col1,
MIN(CASE WHEN v.colNum = 2 THEN v.value END) AS Col2,
MIN(CASE WHEN v.colNum = 3 THEN v.value END) AS Col3,
MIN(CASE WHEN v.colNum = 4 THEN v.value END) AS Col4,
MIN(CASE WHEN v.colNum = 5 THEN v.value END) AS Col5
FROM (
-- For each ID, build the list of non-NULL values, using
-- a delimiter that will not exist in your data
SELECT t.ID,
STRING_AGG(t.Col1,CHAR(0)) as stringAgg1,
STRING_AGG(t.Col2,CHAR(0)) as stringAgg2,
STRING_AGG(t.Col3,CHAR(0)) as stringAgg3,
STRING_AGG(t.Col4,CHAR(0)) as stringAgg4,
STRING_AGG(t.Col5,CHAR(0)) as stringAgg5
FROM #table t
GROUP BY t.ID
) s
OUTER APPLY (
-- For each ID, unpivot the list of non-NULL values,
-- appending an arbitrary row number to each value
SELECT value, 1 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg1,CHAR(0)) UNION ALL
SELECT value, 2 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg2,CHAR(0)) UNION ALL
SELECT value, 3 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg3,CHAR(0)) UNION ALL
SELECT value, 4 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg4,CHAR(0)) UNION ALL
SELECT value, 5 AS colNum, ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rn FROM STRING_SPLIT(s.stringAgg5,CHAR(0))
) v
-- For each ID, group together all column values with same row number
GROUP BY s.ID, v.rn
GOContext
StackExchange Database Administrators Q#210150, answer score: 6
Revisions (0)
No revisions yet.