patternsqlModerate
Find parent rows that have identical sets of child rows
Viewed 0 times
rowsparentidenticalthatchildfindsetshave
Problem
Suppose I have a structure like this:
Recipes table
RecipeIngredients table
The key on
What are some good ways for finding duplicate recipes? A duplicate recipe is defined as having the exact same set of ingredients and quantities for each ingredient.
I've thought of using
There are 48K recipes and 200K ingredient rows.
Recipes table
RecipeID
Name
DescriptionRecipeIngredients table
RecipeID
IngredientID
Quantity
UOMThe key on
RecipeIngredients is (RecipeID, IngredientID).What are some good ways for finding duplicate recipes? A duplicate recipe is defined as having the exact same set of ingredients and quantities for each ingredient.
I've thought of using
FOR XML PATH to combine the ingredients into a single column. I haven't fully explored this but it should work if I make sure the ingredients/UOMs/quantities are sorted in the same sequence and have a proper separator. Are there better approaches?There are 48K recipes and 200K ingredient rows.
Solution
This is a generalization of the relational division problem. No idea how efficient this will be:
Another (similar) approach:
And another, different one:
Tested at SQL-Fiddle
Using the
(ignore this as it may give false positives)
; WITH cte AS
( SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
RecipeID_2 = r2.RecipeID, Name_2 = r2.Name
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID <> r2.RecipeID
WHERE NOT EXISTS
( SELECT 1
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
AND NOT EXISTS
( SELECT 1
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
AND ri1.IngredientID = ri2.IngredientID
AND ri1.Quantity = ri2.Quantity
AND ri1.UOM = ri2.UOM
)
)
)
SELECT c1.*
FROM cte AS c1
JOIN cte AS c2
ON c1.RecipeID_1 = c2.RecipeID_2
AND c1.RecipeID_2 = c2.RecipeID_1
AND c1.RecipeID_1 < c1.RecipeID_2;Another (similar) approach:
SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
RecipeID_2 = r2.RecipeID, Name_2 = r2.Name
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID < r2.RecipeID
AND NOT EXISTS
( SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
EXCEPT
SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
)
AND NOT EXISTS
( SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
EXCEPT
SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
) ;And another, different one:
; WITH cte AS
( SELECT RecipeID_1 = r.RecipeID, RecipeID_2 = ri.RecipeID,
ri.IngredientID, ri.Quantity, ri.UOM
FROM Recipes AS r
CROSS JOIN RecipeIngredients AS ri
)
, cte2 AS
( SELECT RecipeID_1, RecipeID_2,
IngredientID, Quantity, UOM
FROM cte
EXCEPT
SELECT RecipeID_2, RecipeID_1,
IngredientID, Quantity, UOM
FROM cte
)
SELECT RecipeID_1 = r1.RecipeID, RecipeID_2 = r2.RecipeID
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID < r2.RecipeID
EXCEPT
SELECT RecipeID_1, RecipeID_2
FROM cte2
EXCEPT
SELECT RecipeID_2, RecipeID_1
FROM cte2 ;Tested at SQL-Fiddle
Using the
CHECKSUM() and CHECKSUM_AGG() functions, test at SQL-Fiddle-2:(ignore this as it may give false positives)
ALTER TABLE RecipeIngredients
ADD ck AS CHECKSUM( IngredientID, Quantity, UOM )
PERSISTED ;
CREATE INDEX ckecksum_IX
ON RecipeIngredients
( RecipeID, ck ) ;
; WITH cte AS
( SELECT RecipeID,
cka = CHECKSUM_AGG(ck)
FROM RecipeIngredients AS ri
GROUP BY RecipeID
)
SELECT RecipeID_1 = c1.RecipeID, RecipeID_2 = c2.RecipeID
FROM cte AS c1
JOIN cte AS c2
ON c1.cka = c2.cka
AND c1.RecipeID < c2.RecipeID ;Code Snippets
; WITH cte AS
( SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
RecipeID_2 = r2.RecipeID, Name_2 = r2.Name
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID <> r2.RecipeID
WHERE NOT EXISTS
( SELECT 1
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
AND NOT EXISTS
( SELECT 1
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
AND ri1.IngredientID = ri2.IngredientID
AND ri1.Quantity = ri2.Quantity
AND ri1.UOM = ri2.UOM
)
)
)
SELECT c1.*
FROM cte AS c1
JOIN cte AS c2
ON c1.RecipeID_1 = c2.RecipeID_2
AND c1.RecipeID_2 = c2.RecipeID_1
AND c1.RecipeID_1 < c1.RecipeID_2;SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
RecipeID_2 = r2.RecipeID, Name_2 = r2.Name
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID < r2.RecipeID
AND NOT EXISTS
( SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
EXCEPT
SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
)
AND NOT EXISTS
( SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri2
WHERE ri2.RecipeID = r2.RecipeID
EXCEPT
SELECT IngredientID, Quantity, UOM
FROM RecipeIngredients AS ri1
WHERE ri1.RecipeID = r1.RecipeID
) ;; WITH cte AS
( SELECT RecipeID_1 = r.RecipeID, RecipeID_2 = ri.RecipeID,
ri.IngredientID, ri.Quantity, ri.UOM
FROM Recipes AS r
CROSS JOIN RecipeIngredients AS ri
)
, cte2 AS
( SELECT RecipeID_1, RecipeID_2,
IngredientID, Quantity, UOM
FROM cte
EXCEPT
SELECT RecipeID_2, RecipeID_1,
IngredientID, Quantity, UOM
FROM cte
)
SELECT RecipeID_1 = r1.RecipeID, RecipeID_2 = r2.RecipeID
FROM Recipes AS r1
JOIN Recipes AS r2
ON r1.RecipeID < r2.RecipeID
EXCEPT
SELECT RecipeID_1, RecipeID_2
FROM cte2
EXCEPT
SELECT RecipeID_2, RecipeID_1
FROM cte2 ;ALTER TABLE RecipeIngredients
ADD ck AS CHECKSUM( IngredientID, Quantity, UOM )
PERSISTED ;
CREATE INDEX ckecksum_IX
ON RecipeIngredients
( RecipeID, ck ) ;
; WITH cte AS
( SELECT RecipeID,
cka = CHECKSUM_AGG(ck)
FROM RecipeIngredients AS ri
GROUP BY RecipeID
)
SELECT RecipeID_1 = c1.RecipeID, RecipeID_2 = c2.RecipeID
FROM cte AS c1
JOIN cte AS c2
ON c1.cka = c2.cka
AND c1.RecipeID < c2.RecipeID ;Context
StackExchange Database Administrators Q#45302, answer score: 10
Revisions (0)
No revisions yet.