patternsqlMinor
Deduplicating value pairs between 2 columns
Viewed 0 times
deduplicatingcolumnsvaluebetweenpairs
Problem
This was asked in the SQL Helpline room:
While it seemed simple at first, it was surprisingly difficult to get it to work. In the end, I had to resort to a loop. Please mind, this looks like example code, but it is more like practice code, so that the logic can be applied to real situations.
I saved a demo here (which uses temp tables) and the query goes like this:
```
/ Assuming that @Origin is the source table your data currently is: /
declare @Origin table (val1 char(1), val2 char(1));
insert into @Origin (val1, val2)
values
('A', 'B'),
('A', 'C'),
('B', 'A'),
('B', 'C'),
('C', 'A'),
('C', 'B');
/* Create a temp table in memory:
Note: We could also use a physicial #holdingTable
if the result set is too large for memory */
declare @holdingTable table (
rownum int primary key,
val1 char(1),
val2 char(1)
);
insert into @holdingTable (
rownum,
val1,
val2
)
select
/ generate a rownum to use for looping: /
row_number() over (order by val1, val2),
/ add values from origin table: /
val1,
val2
from @Origin;
/ loop variables: /
declare @i int = (select min(rownum) from @holdingTable);
declare @max int = (select max(rownum) from @holdingTable);
/ deduplication variables: /
declare @val1Deduplicator char(1);
declare @val2Deduplicator char(1);
/ table to hold result set until end of query: /
declare @resultSet table (val1 char(1), val2 char(1));
while @i <= @max
begin
/ assign values to deduplicate across columns: /
select
/ inversion between val1 and val2 happens here: /
@val1Deduplicator = val2,
@val2Deduplicator = val1
from @holdingTable
where rownum = @i;
/ verify we are not inserting a duplicate: /
if not exists (
s
I have a sql query concern. Could you help me?
It goes like this. I have a table with the Records
A B
A C
B A
B C
C A
C B
How do I get only the records that there aren't duplicated in both columns
A B
A C
B CWhile it seemed simple at first, it was surprisingly difficult to get it to work. In the end, I had to resort to a loop. Please mind, this looks like example code, but it is more like practice code, so that the logic can be applied to real situations.
I saved a demo here (which uses temp tables) and the query goes like this:
```
/ Assuming that @Origin is the source table your data currently is: /
declare @Origin table (val1 char(1), val2 char(1));
insert into @Origin (val1, val2)
values
('A', 'B'),
('A', 'C'),
('B', 'A'),
('B', 'C'),
('C', 'A'),
('C', 'B');
/* Create a temp table in memory:
Note: We could also use a physicial #holdingTable
if the result set is too large for memory */
declare @holdingTable table (
rownum int primary key,
val1 char(1),
val2 char(1)
);
insert into @holdingTable (
rownum,
val1,
val2
)
select
/ generate a rownum to use for looping: /
row_number() over (order by val1, val2),
/ add values from origin table: /
val1,
val2
from @Origin;
/ loop variables: /
declare @i int = (select min(rownum) from @holdingTable);
declare @max int = (select max(rownum) from @holdingTable);
/ deduplication variables: /
declare @val1Deduplicator char(1);
declare @val2Deduplicator char(1);
/ table to hold result set until end of query: /
declare @resultSet table (val1 char(1), val2 char(1));
while @i <= @max
begin
/ assign values to deduplicate across columns: /
select
/ inversion between val1 and val2 happens here: /
@val1Deduplicator = val2,
@val2Deduplicator = val1
from @holdingTable
where rownum = @i;
/ verify we are not inserting a duplicate: /
if not exists (
s
Solution
If
Given:
If you just need the values, and not the actual rows, you can use
Result:
If you need the original rows instead you have to eliminate the duplicates yourself. This can be done using the normalized CTE and partitioning by the normalized values:
Result:
As for performance, the iterative approach is, according to the execution plan in SSMS14, about 10 times slower. The set based solution is a table scan + a sort where as the execution plan for the iterative solution is a mess to look at.
val1 and val2 are comparable it is easy to find the distinct values, using set-based operations, by normalizing first and then taking distinct rows.Given:
CREATE TABLE [TestTable]
(
[TestID] INT NOT NULL IDENTITY(1,1),
[Value1] NCHAR(1) NOT NULL,
[Value2] NCHAR(1) NOT NULL,
)
INSERT INTO [TestTable]
([Value1], [Value2])
VALUES
(N'A', N'B'),
(N'A', N'C'),
(N'B', N'A'),
(N'B', N'C'),
(N'C', N'A'),
(N'C', N'B'),
(N'D', N'C');If you just need the values, and not the actual rows, you can use
SELECT DISTINCT with a normalized CTE:WITH [Normalized] AS
(
SELECT
CASE WHEN [tt].[Value1] > [tt].[Value2]
THEN [tt].[Value2]
ELSE [tt].[Value1] END [N1],
CASE WHEN [tt].[Value1] > [tt].[Value2]
THEN [tt].[Value1]
ELSE [tt].[Value2] END [N2]
FROM
[TestTable] [tt]
)
SELECT DISTINCT
[n].[N1] [Value1],
[n].[N2] [Value2]
FROM
[Normalized] [n];Result:
Value1 Value2
------ ------
A B
A C
B C
C D
If you need the original rows instead you have to eliminate the duplicates yourself. This can be done using the normalized CTE and partitioning by the normalized values:
WITH [Normalized] AS
(
SELECT
CASE WHEN [tt].[Value1] > [tt].[Value2]
THEN [tt].[Value2]
ELSE [tt].[Value1] END [N1],
CASE WHEN [tt].[Value1] > [tt].[Value2]
THEN [tt].[Value1]
ELSE [tt].[Value2] END [N2],
[tt].[TestID],
[tt].[Value1],
[tt].[Value2]
FROM
[TestTable] [tt]
),
[Ranked] AS
(
SELECT
[n].[TestID],
[n].[Value1],
[n].[Value2],
ROW_NUMBER() OVER (PARTITION BY [n].[N1], [n].[N2]
ORDER BY [n].[TestID] ASC) AS [Rank]
FROM
[Normalized] [n]
),
[Distinct] AS
(
SELECT
[r].[TestID],
[r].[Value1],
[r].[Value2]
FROM
[Ranked] [r]
WHERE
[r].[Rank] = 1
)
SELECT [d].[TestID], [d].[Value1], [d].[Value2]
FROM
[Distinct] [d];Result:
TestID Value1 Value2
----------- ------ ------
1 A B
2 A C
4 B C
7 D C
As for performance, the iterative approach is, according to the execution plan in SSMS14, about 10 times slower. The set based solution is a table scan + a sort where as the execution plan for the iterative solution is a mess to look at.
Code Snippets
CREATE TABLE [TestTable]
(
[TestID] INT NOT NULL IDENTITY(1,1),
[Value1] NCHAR(1) NOT NULL,
[Value2] NCHAR(1) NOT NULL,
)
INSERT INTO [TestTable]
([Value1], [Value2])
VALUES
(N'A', N'B'),
(N'A', N'C'),
(N'B', N'A'),
(N'B', N'C'),
(N'C', N'A'),
(N'C', N'B'),
(N'D', N'C');WITH [Normalized] AS
(
SELECT
CASE WHEN [tt].[Value1] > [tt].[Value2]
THEN [tt].[Value2]
ELSE [tt].[Value1] END [N1],
CASE WHEN [tt].[Value1] > [tt].[Value2]
THEN [tt].[Value1]
ELSE [tt].[Value2] END [N2]
FROM
[TestTable] [tt]
)
SELECT DISTINCT
[n].[N1] [Value1],
[n].[N2] [Value2]
FROM
[Normalized] [n];WITH [Normalized] AS
(
SELECT
CASE WHEN [tt].[Value1] > [tt].[Value2]
THEN [tt].[Value2]
ELSE [tt].[Value1] END [N1],
CASE WHEN [tt].[Value1] > [tt].[Value2]
THEN [tt].[Value1]
ELSE [tt].[Value2] END [N2],
[tt].[TestID],
[tt].[Value1],
[tt].[Value2]
FROM
[TestTable] [tt]
),
[Ranked] AS
(
SELECT
[n].[TestID],
[n].[Value1],
[n].[Value2],
ROW_NUMBER() OVER (PARTITION BY [n].[N1], [n].[N2]
ORDER BY [n].[TestID] ASC) AS [Rank]
FROM
[Normalized] [n]
),
[Distinct] AS
(
SELECT
[r].[TestID],
[r].[Value1],
[r].[Value2]
FROM
[Ranked] [r]
WHERE
[r].[Rank] = 1
)
SELECT [d].[TestID], [d].[Value1], [d].[Value2]
FROM
[Distinct] [d];Context
StackExchange Code Review Q#118013, answer score: 7
Revisions (0)
No revisions yet.