HiveBrain v1.2.0
Get Started
← Back to all entries
patternsqlModerate

Find parent rows that have identical sets of child rows

Submitted by: @import:stackexchange-dba··
0
Viewed 0 times
rowsparentidenticalthatchildfindsetshave

Problem

Suppose I have a structure like this:
Recipes table

RecipeID
Name
Description


RecipeIngredients table

RecipeID
IngredientID
Quantity
UOM


The key on RecipeIngredients is (RecipeID, IngredientID).

What are some good ways for finding duplicate recipes? A duplicate recipe is defined as having the exact same set of ingredients and quantities for each ingredient.

I've thought of using FOR XML PATH to combine the ingredients into a single column. I haven't fully explored this but it should work if I make sure the ingredients/UOMs/quantities are sorted in the same sequence and have a proper separator. Are there better approaches?

There are 48K recipes and 200K ingredient rows.

Solution

This is a generalization of the relational division problem. No idea how efficient this will be:

; WITH cte AS
( SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
         RecipeID_2 = r2.RecipeID, Name_2 = r2.Name  
  FROM Recipes AS r1
    JOIN Recipes AS r2
      ON r1.RecipeID <> r2.RecipeID
  WHERE NOT EXISTS
        ( SELECT 1
          FROM RecipeIngredients AS ri1
          WHERE ri1.RecipeID = r1.RecipeID 
            AND NOT EXISTS
                ( SELECT 1
                  FROM RecipeIngredients AS ri2
                  WHERE ri2.RecipeID = r2.RecipeID 
                    AND ri1.IngredientID = ri2.IngredientID
                    AND ri1.Quantity = ri2.Quantity
                    AND ri1.UOM = ri2.UOM
                )
         )
)
SELECT c1.*
FROM cte AS c1
  JOIN cte AS c2
    ON  c1.RecipeID_1 = c2.RecipeID_2
    AND c1.RecipeID_2 = c2.RecipeID_1
    AND c1.RecipeID_1 < c1.RecipeID_2;


Another (similar) approach:

SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
       RecipeID_2 = r2.RecipeID, Name_2 = r2.Name 
FROM Recipes AS r1
  JOIN Recipes AS r2
    ON  r1.RecipeID < r2.RecipeID 
    AND NOT EXISTS
        ( SELECT IngredientID, Quantity, UOM
          FROM RecipeIngredients AS ri1
          WHERE ri1.RecipeID = r1.RecipeID
        EXCEPT 
          SELECT IngredientID, Quantity, UOM
          FROM RecipeIngredients AS ri2
          WHERE ri2.RecipeID = r2.RecipeID
        )
    AND NOT EXISTS
        ( SELECT IngredientID, Quantity, UOM
          FROM RecipeIngredients AS ri2
          WHERE ri2.RecipeID = r2.RecipeID
        EXCEPT 
          SELECT IngredientID, Quantity, UOM
          FROM RecipeIngredients AS ri1
          WHERE ri1.RecipeID = r1.RecipeID
        ) ;


And another, different one:

; WITH cte AS
( SELECT RecipeID_1 = r.RecipeID, RecipeID_2 = ri.RecipeID, 
          ri.IngredientID, ri.Quantity, ri.UOM
  FROM Recipes AS r
    CROSS JOIN RecipeIngredients AS ri
)
, cte2 AS
( SELECT RecipeID_1, RecipeID_2,
         IngredientID, Quantity, UOM
  FROM cte
EXCEPT
  SELECT RecipeID_2, RecipeID_1,
         IngredientID, Quantity, UOM
  FROM cte
)

  SELECT RecipeID_1 = r1.RecipeID, RecipeID_2 = r2.RecipeID
  FROM Recipes AS r1
    JOIN Recipes AS r2
      ON r1.RecipeID < r2.RecipeID
EXCEPT 
  SELECT RecipeID_1, RecipeID_2
  FROM cte2
EXCEPT 
  SELECT RecipeID_2, RecipeID_1
  FROM cte2 ;


Tested at SQL-Fiddle

Using the CHECKSUM() and CHECKSUM_AGG() functions, test at SQL-Fiddle-2:

(ignore this as it may give false positives)

ALTER TABLE RecipeIngredients
  ADD ck AS CHECKSUM( IngredientID, Quantity, UOM )
    PERSISTED ;

CREATE INDEX ckecksum_IX
  ON RecipeIngredients
    ( RecipeID, ck ) ;

; WITH cte AS
( SELECT RecipeID,
         cka = CHECKSUM_AGG(ck)
  FROM RecipeIngredients AS ri
  GROUP BY RecipeID
)
SELECT RecipeID_1 = c1.RecipeID, RecipeID_2 = c2.RecipeID
FROM cte AS c1
  JOIN cte AS c2
    ON  c1.cka = c2.cka
    AND c1.RecipeID < c2.RecipeID  ;

Code Snippets

; WITH cte AS
( SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
         RecipeID_2 = r2.RecipeID, Name_2 = r2.Name  
  FROM Recipes AS r1
    JOIN Recipes AS r2
      ON r1.RecipeID <> r2.RecipeID
  WHERE NOT EXISTS
        ( SELECT 1
          FROM RecipeIngredients AS ri1
          WHERE ri1.RecipeID = r1.RecipeID 
            AND NOT EXISTS
                ( SELECT 1
                  FROM RecipeIngredients AS ri2
                  WHERE ri2.RecipeID = r2.RecipeID 
                    AND ri1.IngredientID = ri2.IngredientID
                    AND ri1.Quantity = ri2.Quantity
                    AND ri1.UOM = ri2.UOM
                )
         )
)
SELECT c1.*
FROM cte AS c1
  JOIN cte AS c2
    ON  c1.RecipeID_1 = c2.RecipeID_2
    AND c1.RecipeID_2 = c2.RecipeID_1
    AND c1.RecipeID_1 < c1.RecipeID_2;
SELECT RecipeID_1 = r1.RecipeID, Name_1 = r1.Name,
       RecipeID_2 = r2.RecipeID, Name_2 = r2.Name 
FROM Recipes AS r1
  JOIN Recipes AS r2
    ON  r1.RecipeID < r2.RecipeID 
    AND NOT EXISTS
        ( SELECT IngredientID, Quantity, UOM
          FROM RecipeIngredients AS ri1
          WHERE ri1.RecipeID = r1.RecipeID
        EXCEPT 
          SELECT IngredientID, Quantity, UOM
          FROM RecipeIngredients AS ri2
          WHERE ri2.RecipeID = r2.RecipeID
        )
    AND NOT EXISTS
        ( SELECT IngredientID, Quantity, UOM
          FROM RecipeIngredients AS ri2
          WHERE ri2.RecipeID = r2.RecipeID
        EXCEPT 
          SELECT IngredientID, Quantity, UOM
          FROM RecipeIngredients AS ri1
          WHERE ri1.RecipeID = r1.RecipeID
        ) ;
; WITH cte AS
( SELECT RecipeID_1 = r.RecipeID, RecipeID_2 = ri.RecipeID, 
          ri.IngredientID, ri.Quantity, ri.UOM
  FROM Recipes AS r
    CROSS JOIN RecipeIngredients AS ri
)
, cte2 AS
( SELECT RecipeID_1, RecipeID_2,
         IngredientID, Quantity, UOM
  FROM cte
EXCEPT
  SELECT RecipeID_2, RecipeID_1,
         IngredientID, Quantity, UOM
  FROM cte
)

  SELECT RecipeID_1 = r1.RecipeID, RecipeID_2 = r2.RecipeID
  FROM Recipes AS r1
    JOIN Recipes AS r2
      ON r1.RecipeID < r2.RecipeID
EXCEPT 
  SELECT RecipeID_1, RecipeID_2
  FROM cte2
EXCEPT 
  SELECT RecipeID_2, RecipeID_1
  FROM cte2 ;
ALTER TABLE RecipeIngredients
  ADD ck AS CHECKSUM( IngredientID, Quantity, UOM )
    PERSISTED ;

CREATE INDEX ckecksum_IX
  ON RecipeIngredients
    ( RecipeID, ck ) ;

; WITH cte AS
( SELECT RecipeID,
         cka = CHECKSUM_AGG(ck)
  FROM RecipeIngredients AS ri
  GROUP BY RecipeID
)
SELECT RecipeID_1 = c1.RecipeID, RecipeID_2 = c2.RecipeID
FROM cte AS c1
  JOIN cte AS c2
    ON  c1.cka = c2.cka
    AND c1.RecipeID < c2.RecipeID  ;

Context

StackExchange Database Administrators Q#45302, answer score: 10

Revisions (0)

No revisions yet.