patternsqlMinor
Loading Data Warehouse with Dynamic SQL
Viewed 0 times
warehousewithsqlloadingdynamicdata
Problem
For a data warehousing project I ran into the following:
Custom fields that users can create, modify and delete, that should be loaded into the data warehouse as they are when the ETL happens.
On the profiling part of the area in the database that holds this data I discovered that theses fields are not columns in tables, but are rows in a "field definitions" table and rows in a "field values" table. Another aspect is that, in the actual source data, there are hundreds of columns that belong to several dozen groups.
So far the only way I have found do deal with this is using a cursor and dynamic SQL to create my dimensions from this data. I am interested if there is any other way to do this, and what others think of the code I have written to handle this.
Update: The datawarehousing project uses the MS BI stack (ssis/ssas/ssrs) with this solution as a script in the ssis ETL.
The first block is a script to create demonstration tables with values as they are on the source system, the second block is the code I am using to generate the desired result. This code actually does everything that is required but as you can see it is rather bulky. Any thought on improvements (or a completely different method) are appreciated.
This creates a situation like the source data:
```
CREATE TABLE groups (
group_id INT identity(1, 1) PRIMARY KEY,
group_name NVARCHAR(100)
)
INSERT groups (group_name)
VALUES ('group_1'),
('group_2'),
('group_3'),
('group_4'),
('group_5')
CREATE TABLE tickets (
ticket_id INT identity(1, 1) PRIMARY KEY,
group_id INT FOREIGN KEY REFERENCES groups(group_id) NOT NULL,
some_foreign_key INT NULL,
some_timestamp DATETIME NULL
)
WHILE SCOPE_IDENTITY() < 50
BEGIN
INSERT tickets (
group_id,
some_foreign_key,
some_timestamp
)
VALUES
(1,CAST((RAND() 100) AS INT),DATEADD(DD, CAST((RAND() 100) AS INT), GETDATE())),
(2,CAST((RAND() * 100) AS INT),DATEAD
Custom fields that users can create, modify and delete, that should be loaded into the data warehouse as they are when the ETL happens.
On the profiling part of the area in the database that holds this data I discovered that theses fields are not columns in tables, but are rows in a "field definitions" table and rows in a "field values" table. Another aspect is that, in the actual source data, there are hundreds of columns that belong to several dozen groups.
So far the only way I have found do deal with this is using a cursor and dynamic SQL to create my dimensions from this data. I am interested if there is any other way to do this, and what others think of the code I have written to handle this.
Update: The datawarehousing project uses the MS BI stack (ssis/ssas/ssrs) with this solution as a script in the ssis ETL.
The first block is a script to create demonstration tables with values as they are on the source system, the second block is the code I am using to generate the desired result. This code actually does everything that is required but as you can see it is rather bulky. Any thought on improvements (or a completely different method) are appreciated.
This creates a situation like the source data:
```
CREATE TABLE groups (
group_id INT identity(1, 1) PRIMARY KEY,
group_name NVARCHAR(100)
)
INSERT groups (group_name)
VALUES ('group_1'),
('group_2'),
('group_3'),
('group_4'),
('group_5')
CREATE TABLE tickets (
ticket_id INT identity(1, 1) PRIMARY KEY,
group_id INT FOREIGN KEY REFERENCES groups(group_id) NOT NULL,
some_foreign_key INT NULL,
some_timestamp DATETIME NULL
)
WHILE SCOPE_IDENTITY() < 50
BEGIN
INSERT tickets (
group_id,
some_foreign_key,
some_timestamp
)
VALUES
(1,CAST((RAND() 100) AS INT),DATEADD(DD, CAST((RAND() 100) AS INT), GETDATE())),
(2,CAST((RAND() * 100) AS INT),DATEAD
Solution
I noticed that you have a Select nested inside of a select twice, which looks kind of ugly...
This also brings up a formatting issue where there are no new lines in between these two variable declaration blocks which makes it a little hard to read.
but what you should do is use a Common Table Expression(CTE) on these or better yet since the inner most Select statement on both is the same, you should create a temporary table and select from it.
CTE:
Temp Table:
And then your Query would look like this
Looks like you could do something similar with the
DECLARE @maxcols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',MAX(' + QUOTENAME(v.field_name) + ') AS ' + QUOTENAME(v.field_name)
FROM (
SELECT d.field_name
FROM tickets t
INNER JOIN groups g
ON t.group_id = g.group_id
INNER JOIN custom_field_entries e
ON t.ticket_id = e.ticket_id
INNER JOIN custom_field_definitions d
ON d.field_id = e.field_id
WHERE (g.group_name = @group)
) v
FOR XML PATH(''),
TYPE
).value('.', 'NVARCHAR(MAX)'), 1, 1, '')
DECLARE @cols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',' + QUOTENAME(v.field_name)
FROM (
SELECT d.field_name
FROM tickets t
INNER JOIN groups g
ON t.group_id = g.group_id
INNER JOIN custom_field_entries e
ON t.ticket_id = e.ticket_id
INNER JOIN custom_field_definitions d
ON d.field_id = e.field_id
WHERE (g.group_name = @group)
) v
FOR XML PATH(''),
TYPE
).value('.', 'NVARCHAR(MAX)'), 1, 1, '')This also brings up a formatting issue where there are no new lines in between these two variable declaration blocks which makes it a little hard to read.
but what you should do is use a Common Table Expression(CTE) on these or better yet since the inner most Select statement on both is the same, you should create a temporary table and select from it.
CTE:
WITH tableV (field_name)
AS
(
SELECT d.field_name
FROM tickets t
INNER JOIN groups g
ON t.group_id = g.group_id
INNER JOIN custom_field_entries e
ON t.ticket_id = e.ticket_id
INNER JOIN custom_field_definitions d
ON d.field_id = e.field_id
WHERE (g.group_name = @group)
)
DECLARE @maxcols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',MAX(' + QUOTENAME(v.field_name) + ') AS ' + QUOTENAME(v.field_name)
FROM tableV
FOR XML PATH(''),TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, '')Temp Table:
CREATE TABLE #tableV (field_name);
INSERT INTO #tableV
SELECT d.field_name
FROM tickets t
INNER JOIN groups g
ON t.group_id = g.group_id
INNER JOIN custom_field_entries e
ON t.ticket_id = e.ticket_id
INNER JOIN custom_field_definitions d
ON d.field_id = e.field_id
WHERE (g.group_name = @group)And then your Query would look like this
DECLARE @maxcols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',MAX(' + QUOTENAME(v.field_name) + ') AS ' + QUOTENAME(v.field_name)
FROM #tableV
FOR XML PATH(''),TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, '')
DECLARE @cols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',' + QUOTENAME(v.field_name)
FROM #tableV
FOR XML PATH(''),TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, '')Looks like you could do something similar with the
@sql variable as well, even if only to make it more readable and to nest it less.Code Snippets
DECLARE @maxcols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',MAX(' + QUOTENAME(v.field_name) + ') AS ' + QUOTENAME(v.field_name)
FROM (
SELECT d.field_name
FROM tickets t
INNER JOIN groups g
ON t.group_id = g.group_id
INNER JOIN custom_field_entries e
ON t.ticket_id = e.ticket_id
INNER JOIN custom_field_definitions d
ON d.field_id = e.field_id
WHERE (g.group_name = @group)
) v
FOR XML PATH(''),
TYPE
).value('.', 'NVARCHAR(MAX)'), 1, 1, '')
DECLARE @cols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',' + QUOTENAME(v.field_name)
FROM (
SELECT d.field_name
FROM tickets t
INNER JOIN groups g
ON t.group_id = g.group_id
INNER JOIN custom_field_entries e
ON t.ticket_id = e.ticket_id
INNER JOIN custom_field_definitions d
ON d.field_id = e.field_id
WHERE (g.group_name = @group)
) v
FOR XML PATH(''),
TYPE
).value('.', 'NVARCHAR(MAX)'), 1, 1, '')WITH tableV (field_name)
AS
(
SELECT d.field_name
FROM tickets t
INNER JOIN groups g
ON t.group_id = g.group_id
INNER JOIN custom_field_entries e
ON t.ticket_id = e.ticket_id
INNER JOIN custom_field_definitions d
ON d.field_id = e.field_id
WHERE (g.group_name = @group)
)
DECLARE @maxcols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',MAX(' + QUOTENAME(v.field_name) + ') AS ' + QUOTENAME(v.field_name)
FROM tableV
FOR XML PATH(''),TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, '')CREATE TABLE #tableV (field_name);
INSERT INTO #tableV
SELECT d.field_name
FROM tickets t
INNER JOIN groups g
ON t.group_id = g.group_id
INNER JOIN custom_field_entries e
ON t.ticket_id = e.ticket_id
INNER JOIN custom_field_definitions d
ON d.field_id = e.field_id
WHERE (g.group_name = @group)DECLARE @maxcols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',MAX(' + QUOTENAME(v.field_name) + ') AS ' + QUOTENAME(v.field_name)
FROM #tableV
FOR XML PATH(''),TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, '')
DECLARE @cols AS NVARCHAR(MAX) = STUFF((
SELECT DISTINCT ',' + QUOTENAME(v.field_name)
FROM #tableV
FOR XML PATH(''),TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, '')Context
StackExchange Code Review Q#56551, answer score: 2
Revisions (0)
No revisions yet.