Sql-server – Data Warehouse: Auto Generate Slowly Changing Dimension Code For Any Table with Dynamic Sql

data-warehousedatabase-designslowly-changing-dimensionsql serversql-server-2016

Can someone code review to take OLTP/Stage data and transform it into Data Warehouse Kimball Slowly Changing Dimension Type 2? This code should be dynamic, and should be able to utilize any table as input.

I have been searching for code online similar to Ola Hallengren, Brent Ozar, or open source to formulate data into slowly changing dimensions. SQL Temporal tables only apply more to ETL Dates, rather than business dates (and I have a backlog of data). Some OLTP vendors don't allow temporal tables on their structures .

We have literally 100 hundreds of tables which need to be placed into Slowly Changing dimensions. The data have duplicates or repeated, other times columns need to be renamed/ or excluded from slowly changing dimension. Searching for an automatic code generator. It is taking a long time to code them all.

Can someone make this better, parametrize more, improve it, and enhance (hopefully open source this). Working on this for couple weeks. Feel free to try out/edit, or write a new answer below.

create procedure dbo.Dim_Type2_GenerateCode
    @TableNameSource varchar(255),
    @TableSourceLoadDate varchar(255) = 'LoadDatetime',

    @NaturalKey varchar(255),
    @NaturalBeginDateChange varchar(255),
    @NaturalEndDateChange varchar(255) = null,

    @RepeatedDataFlag bit = 0,
    @TempTableFlag bit = 0,
    @ColumnExcludeList varchar(max) = null

as

set nocount on

-- Purpose:  Generate slowly changing dimensions for any table
-- Note:
-- In case of Column Renames or joins between Staging and Dimension, utilize MapVw, eg FoodMapVw, CustomerMapVw, ProductMapVw
-- To Include only certain columns between Staging and Dimension, utilize MapVw, eg FoodMapVw, CustomerMapVw, ProductMapVw
-- To Load data into smaller temp table to prevent querying repeatedly, utilize @TempTableFlag = 1
-- Column Exclusion parameter can be added to exclude certain columns
-- Works with three table types below: Transaction Dates, Tables with Beg/EndDates already, Tables with RepeatedData

--      ProductId ProductName  TransactionDate      ProductId    ProductName   TransactionDate  EndEffDate           ProductId     ProductName   TransactionDate (Repeated)
--         1      Apple         4/1/2018              1             Apple         4/1/2018       4/3/2018               1              Apple           4/1/2018
--         1      Apple         4/3/2018              1             Apple         4/3/2018       4/7/2018               1              Apple           4/1/2018
--         1      Apple         4/7/2018              1             Apple         4/12/2018      12/31/999              1              Apple           4/2/2018
--                                                                                                                      1              Apple           4/3/2018

set @TableNameSource = right(@TableNameSource, len(@TableNameSource) - charindex('.', @TableNameSource))
declare @StageTableName varchar(255)
declare @DimTableName varchar(255)
set @TempTableFlag = (case when @NaturalBeginDateChange is null or @NaturalEndDateChange is null or @RepeatedDataFlag = 1 then 1 else 0 end)



DECLARE @ColumnExcludeTable table(ColumnExcludeValue varchar(500) not null);
insert into @ColumnExcludeTable(ColumnExcludeValue)
select ltrim(rtrim(value)) as ColumnExcludeValue from string_split(@ColumnExcludeList, ',');



declare @TempTableDeclare varchar(max) = 'create table #'+@TableNameSource+
'
( 
    ' + @TableNameSource + '_id bigint primary key identity(1,1),' + 
    (select STUFF((
    SELECT ', 
    '    
    + c.name + ' ' + 
case 
    when t.name like '%char%' then t.name + '(' + cast(c.max_length as varchar(10)) + ')' 
    when t.name like '%numeric%' or t.name like '%decimal%' then t.name + '(' + cast(c.precision as varchar(10)) + ',' + cast(c.scale as varchar(10)) + ')'
    else t.name
end
FROM .sys.columns c 
inner JOIN sys.types t
    on t.user_type_id = c.user_type_id
    and t.system_type_id = c.system_type_id
where c.object_id = object_id(@TableNameSource) and is_identity = 0
    and c.name not like '%@NaturalKey%' 
    and c.name not like '%EndEffDate%' 
    and c.name <> @NaturalBeginDateChange
    and c.name not in (select columnexcludetable.ColumnExcludeValue from @ColumnExcludeTable columnexcludetable)
FOR XML PATH(''), TYPE).value('.','nvarchar(max)'),1,2,''))
+ ' 
    ,BegEffDatetime datetime
    ,EndEffDatetime datetime
)'  


declare @TempTableCodeInsert varchar(max)=
    (select STUFF((
    SELECT ', 
    '    
    + QUOTENAME(c.name) 
FROM .sys.columns c 
inner JOIN sys.types t
    on t.user_type_id = c.user_type_id
    and t.system_type_id = c.system_type_id
where c.object_id = object_id(@TableNameSource) and is_identity = 0
    and c.name not like '%BegEffDate%' 
    and c.name not like '%EndEffDate%' 
    and c.name <> @NaturalBeginDateChange
    and c.name not in (select columnexcludetable.ColumnExcludeValue from @ColumnExcludeTable columnexcludetable)
FOR XML PATH(''), TYPE).value('.','nvarchar(max)'),1,2,''))

declare @TempTableCodeInsertNoLoadDate varchar(max) = (replace(@TempTableCodeInsert,quotename(@TableSourceLoadDate),''))
set @TempTableCodeInsertNoLoadDate = LEFT(@TempTableCodeInsertNoLoadDate, LEN(@TempTableCodeInsertNoLoadDate)-5)
--set @TempTableCodeInsertNoLoadDate = (replace(@TempTableCodeInsertNoLoadDate,quotename(@NaturalBeginDateChange),''))


declare @ColumnListNoPrimary varchar(max) = 
    (select STUFF((
    SELECT ', 
        '    
    + QUOTENAME(c.name) 
    FROM .sys.columns c 
    where c.object_id = object_id(@TableNameSource) and is_identity = 0
    FOR XML PATH(''), TYPE).value('.','nvarchar(max)'),1,2,''))


declare @ColumnListNoPrimaryNoTimeStamp varchar(max) = 
    (select STUFF((
    SELECT ', 
        '    
    + QUOTENAME(c.name) 
    FROM .sys.columns c 
    where c.object_id = object_id(@TableNameSource) and is_identity = 0
    and c.name <> @NaturalKey
    and c.name not like '%BeginEffDate%' 
    and c.name not like '%EndEffDate%' 
    and c.name not like '%CreateDatetime%' 
    and c.name not like '%UpdateDatetime%'
    and c.name not like '%Loaddate%'
    and c.name <> @NaturalBeginDateChange
    and c.name not in (select columnexcludetable.ColumnExcludeValue from @ColumnExcludeTable columnexcludetable)
    FOR XML PATH(''), TYPE).value('.','nvarchar(max)'),1,2,''))



declare @ColumnListNoPrimaryCompareCheck varchar(max) = 
    (select STUFF((
    SELECT ' or
        '    
    + 'stg.'+ QUOTENAME(c.name) + ' <> dim.' + QUOTENAME(c.name) 
    FROM .sys.columns c 
    where c.object_id = object_id(@TableNameSource) and is_identity = 0
    and c.name <> @NaturalKey
    and c.name not like '%BeginEffDate%' 
    and c.name not like '%EndEffDate%' 
    and c.name not like '%CreateDatetime%' 
    and c.name not like '%UpdateDatetime%'
    and c.name not like '%Loaddate%'
    and c.name <> @NaturalBeginDateChange
    and c.name not in (select columnexcludetable.ColumnExcludeValue from @ColumnExcludeTable columnexcludetable)
    FOR XML PATH(''), TYPE).value('.','nvarchar(max)'),1,3,''))



set @StageTableName = REPLACE(@TableNameSource,'Dim','Stage')
set @StageTableName = (case when @TempTableFlag = 1 then '#' + @StageTableName else 'dbo.' + @StageTableName end)

set @DimTableName = REPLACE(@TableNameSource,'Stage','Dim')
set @DimTableName = REPLACE(@DimTableName,'MapVw','')
set @DimTableName = (case when left(@DimTableName,3) <> 'Dim' then 'Dim_' + @DimTableName else @DimTableName end)

set @NaturalBeginDateChange = (case when @TempTableFlag = 1 then @NaturalBeginDateChange  else @NaturalBeginDateChange end)
declare @NaturalBeginDateChangeRename varchar(255) = (case when @TempTableFlag = 1 then 'BegEffDatetime'  else @NaturalBeginDateChange end)



declare @TableListGenerateCode nvarchar(max) = 

'create procedure dbo.' + @DimTableName +'Update
    @LoadDatetimeParam datetime
as

declare @NewDatetime datetime = getdate() 

'
+ 

'--Declare temp data
' + 
case when @TempTableFlag = 1 

then 
+ @TempTableDeclare + '

insert into ' + @StageTableName + 
'
(       
' + @TempTableCodeInsert + '
    ,BegEffDatetime
    ,EndEffDatetime
)' + 
case when @RepeatedDataFlag = 1
then 
'select 
    ' + replace(@TempTableCodeInsert,quotename(@TableSourceLoadDate),'min('+quotename(@TableSourceLoadDate)+')') + '
    ,min(' + @NaturalBeginDateChange + ') as BegEffDatetime  
    ,case when max(stg.' + @NaturalBeginDateChange + ') = (select max(substage.' + @NaturalBeginDateChange + ') from ' + @TableNameSource + ' substage where stg.'+ @NaturalKey + ' = substage.' + @NaturalKey + ') then ''12/31/9999'' else max(stg.' + @NaturalBeginDateChange + ') end as EndDate
from ' + @TableNameSource + ' stg
where LoadDatetime > @LoadDatetimeParam
group by ' + @TempTableCodeInsertNoLoaddate
when @RepeatedDataFlag = 0 
then
'
select
    distinct'
+  @TempTableCodeInsert + '
    ,' + @NaturalBeginDateChange + ' as BegEffDatetime
    ,ISNULL(Lead(' + @NaturalBeginDateChange + ' + 1) Over (partition by ' + @NaturalKey + ' order by ' + @NaturalBeginDateChange + ' asc), ''12/31/9999'') as EndEffDatetime'


+ '
from ' + @TableNameSource +
'
where LoadDatetime > @LoadDatetimeParam'
end
else 
''
end 
+ ' 
--End temp data section '  + '


--Begin Transaction

begin transaction

    --Close off existing records that have changed

    update dbo.' + @DimTableName + '
    set
        UpdateDatetime = @NewDatetime,
        EndEffDatetime = (select min(' + @NaturalBeginDateChangeRename + ') from ' + @StageTableName +' substage where stg.' + @NaturalKey + ' = substage.' + @NaturalKey + ')
    from dbo.' + @DimTableName + ' dim
    inner join ' + @StageTableName + ' stg
        on dim.' + @NaturalKey + ' = stg.' + @NaturalKey + '
        and dim.EndEffDatetime = ''12/31/9999''
    where 
    stg.LoadDatetime > @LoadDatetimeParam
    and ('
    + @ColumnListNoPrimaryCompareCheck +


    ')
    --Insert new updated records

    insert into dbo.' + @DimTableName + 
    '
    (   
        ' + @NaturalKey  + ',
        ' + @ColumnListNoPrimaryNoTimeStamp + ',' +
        '
        [CreateDatetime],
        [UpdateDatetime],
        [BegEffDatetime],
        [EndEffDatetime]
    )
    select  
        stg.' + @NaturalKey +',' 
        + replace(@ColumnListNoPrimaryNoTimeStamp,'[','stg.[') + 
        '
        ,@newdatetime as CreateDatetime
        ,@newdatetime as UpdateDatetime
        ,stg.BegEffDatetime as BegEffDatetime
        ,stg.EndEffDatetime as EndEffDatetime
    from '  + @StageTableName + ' stg
    inner join ' + @DimTableName + ' dim
        on dim.' + @NaturalKey + ' = stg.' + @NaturalKey + '
        and dim.UpdateDatetime = @NewDatetime
    --Verify Updates
    where 
        stg.LoadDatetime > @LoadDatetimeParam
        and ('
    + @ColumnListNoPrimaryCompareCheck + ')

    --Insert New Business Key records which do not exist

    UNION ALL
        select
        stg.' + @NaturalKey +',
        ' + replace(@ColumnListNoPrimaryNoTimeStamp,'[','stg.[') + 
        '
        ,@newdatetime as CreateDatetime
        ,@newdatetime as UpdateDatetime
        ,stg.BegEffDatetime as BegEffDatetime
        ,''12/31/9999'' as EndEffDate
    from '  + @StageTableName + ' stg
    left join dbo.' + @DimTableName + ' dim
        on dim.' + @NaturalKey + ' = stg.' + @NaturalKey + '
    where dim.' + @NaturalKey + ' is null   

commit transaction

--end code'

-- Print columns in three steps, print and select only can print first 8000 characters
print substring(@TableListGenerateCode,charindex('--Create procedure',@TableListGenerateCode),charindex('--Declare temp data',@TableListGenerateCode))
print substring(@TableListGenerateCode,charindex('--Declare temp data',@TableListGenerateCode),charindex('--End temp data',@TableListGenerateCode)-charindex('--Declare temp data',@TableListGenerateCode))
print substring(@TableListGenerateCode,charindex('--Begin transaction',@TableListGenerateCode),charindex('--Close off existing records',@TableListGenerateCode)-charindex('--Begin Transaction',@TableListGenerateCode))
print  '    ' + substring(@TableListGenerateCode,charindex('--Close off existing records',@TableListGenerateCode),charindex('--Insert new updated records',@TableListGenerateCode)-charindex('--Close off existing records',@TableListGenerateCode))
print  '    ' + substring(@TableListGenerateCode,charindex('--Insert new updated records',@TableListGenerateCode),charindex('--Verify updates',@TableListGenerateCode) -charindex('--Insert new updated records',@TableListGenerateCode) )
print  '    ' + substring(@TableListGenerateCode,charindex('--Verify updates',@TableListGenerateCode),charindex('--Insert New Business',@TableListGenerateCode) -charindex('--Verify updates',@TableListGenerateCode) )
print  '    ' + substring(@TableListGenerateCode,charindex('--Insert New Business',@TableListGenerateCode),charindex('--end code',@TableListGenerateCode)-charindex('--Insert New Business',@TableListGenerateCode))

Test Scenarios:
Personnel keeps mislabeling items at grocery store. Keeps track of history.

drop table [dbo].[Stage_Food]
create table [dbo].[Stage_Food]
(
    [Stagefoodid] int identity(1,1),
    [FoodNaturalId] [int] NOT NULL,
    [FoodName] [varchar](255) NULL,
    [FoodCategory] [varchar](255) NULL,
    [FoodTransactionDate] datetime,
    [LoadDatetime] datetime
    primary key clustered ([Stagefoodid] ASC)
)

drop table dbo.[Dim_food]
create table [dbo].[Dim_food]
(
    [DimFoodId] [int] IDENTITY(1,1) NOT NULL,
    [FoodNaturalId] [int] NULL,
    [FoodName] [varchar](255) NULL,
    [FoodCategory] [varchar](255) NULL,
    [BegEffDatetime] [datetime] NULL,
    [EndEffDatetime] [datetime] NULL,
    [CreateDatetime] [datetime] NULL,
    [Updatedatetime] [datetime] NULL
    primary key clustered ( [DimFoodId] ASC)
) 


exec dbo.Dim_Type2_GenerateCode
    @TableNameSource = 'Stage_Food',
    @TableSourceLoadDate = 'LoadDatetime',

    @NaturalKey = 'FoodNaturalId',
    @NaturalBeginDateChange = 'FoodTransactionDate'



insert into dbo.Stage_Food
values (1,'Apple','Vegetable','5/2/2018','5/4/2018')

exec dbo.Dim_FoodUpdate '5/3/2018'

select * from dbo.Dim_food


insert into dbo.Stage_Food
values (2,'Bread','Meat','5/3/2018','5/5/2018')

exec dbo.Dim_FoodUpdate '5/4/2018'

select * from dbo.Dim_food

insert into dbo.Stage_Food
values (1,'Apple','Candy','5/8/2018','5/9/2018')

dbo.Dim_FoodUpdate '5/8/2018'

select * from dbo.Dim_food


insert into dbo.Stage_Food
values (2,'Bread','Grain','5/8/2018','5/10/2018')


dbo.Dim_FoodUpdate '5/9/2018'

select * from dbo.Dim_food


---------------------------------------
-- Test2 Repeated data flag
---------------------------------------


exec dbo.Dim_Type2_GenerateCode
    @TableNameSource = 'Stage_Food',
    @TableSourceLoadDate = 'LoadDatetime',

    @NaturalKey = 'FoodNaturalId',
    @NaturalBeginDateChange = 'FoodTransactionDate',
    @RepeatedDataFlag = 1,
    @TempTableFlag  = 1


insert into dbo.Stage_Food
values (1,'Apple','Vegetable','5/2/2018','5/4/2018'),
(1,'Apple','Vegetable','5/2/2018','5/4/2018')

exec dbo.Dim_FoodUpdate '5/3/2018'

select * from dbo.Dim_food


insert into dbo.Stage_Food
values (2,'Bread','Meat','5/2/2018','5/5/2018'),
(2,'Bread','Meat','5/3/2018','5/5/2018')



exec dbo.Dim_FoodUpdate '5/4/2018'

select * from dbo.Dim_food

insert into dbo.Stage_Food
values (1,'Apple','Candy','5/6/2018','5/9/2018'),
 (1,'Apple','Candy','5/7/2018','5/9/2018'),
(1,'Apple','Candy','5/8/2018','5/9/2018')

dbo.Dim_FoodUpdate '5/8/2018'

select * from dbo.Dim_food


insert into dbo.Stage_Food
values (2,'Bread','Grain','5/8/2018','5/10/2018')


dbo.Dim_FoodUpdate '5/9/2018'

select * from dbo.Dim_food

You can test out more items, temp table flags, column exclusion list, mapping views

Best Answer

There are tons of ETL frameworks out there for this purpose, although I suspect most of them are proprietary, or specific to one type of data source.

You may want to look into Biml, which allows you to dynamically build ETL flows in SSIS.