Sql-server – Unflatten hierarchical data in SQL Server

hierarchyperformancequery-performancesql server

I have some tables in the staging area of a data warehouse that I'm filling with data from some flat, comma-delimited text extracts from another system. When the data comes in the hierarchy of parents for each element is presented in columns labelled ParentCode01ParentCode11 where the immediate parent of the current node is in ParentCode01 and the top level parent could be in any column (ParentCode11 is mostly NULL).

CREATE TABLE CostCentreHierarchy (
    CostCentreCode varchar(10) NOT NULL CONSTRAINT pCostCentreCode_CostCentreHierarchy PRIMARY KEY,
    CostCentreDesc varchar(100),
    ValidFromDate varchar(10),
    ValidToDate varchar(10),
    ParentCode01 varchar(15),
    ParentDesc01 varchar(100),
    ParentCode02 varchar(15),
    ParentDesc02 varchar(100),
    ParentCode03 varchar(15),
    ParentDesc03 varchar(100),
    ParentCode04 varchar(15),
    ParentDesc04 varchar(100),
    ParentCode05 varchar(15),
    ParentDesc05 varchar(100),
    ParentCode06 varchar(15),
    ParentDesc06 varchar(100),
    ParentCode07 varchar(15),
    ParentDesc07 varchar(100),
    ParentCode08 varchar(15),
    ParentDesc08 varchar(100),
    ParentCode09 varchar(15),
    ParentDesc09 varchar(100),
    ParentCode10 varchar(15),
    ParentDesc10 varchar(100),
    ParentCode11 varchar(15),
    ParentDesc11 varchar(100));

INSERT INTO CostCentreHierarchy 
    (CostCentreCode, CostCentreDesc, ValidFromDate, ValidToDate, ParentCode01, ParentDesc01, ParentCode02, ParentDesc02, 
     ParentCode03, ParentDesc03, ParentCode04, ParentDesc04, ParentCode05, ParentDesc05, ParentCode06, ParentDesc06)
VALUES 
('0002000000', '0002000000', '01.07.1950', '31.12.9999', 'YA0201', 'YA0201', 'YA0200', 'YA0200', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000001', '0002000001', '01.07.1950', '31.12.9999', 'YA0301', 'YA0301', 'YA0300', 'YA0300', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000002', '0002000002', '01.07.1950', '31.12.9999', 'XA0101', 'XA0101', 'XA0100', 'XA0100', 'XA0000', 'Unit 3 - Admin', 'X00000', 'Branch B - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000003', '0002000003', '01.07.1950', '31.12.9999', 'YA0999', 'YA0999', 'YA0900', 'YA0900', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000004', '0002000004', '01.07.1950', '31.12.9999', 'YB0999', 'YB0999', 'YB0900', 'YB0900', 'YB0000', 'Unit 2 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000005', '0002000005', '01.07.1950', '31.12.9999', 'YA0101', 'YA0101', 'YA0100', 'YA0100', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000006', '0002000006', '01.07.1950', '31.12.9999', 'XA0999', 'XA0999', 'XA0900', 'XA0900', 'XA0000', 'Unit 3 - Admin', 'X00000', 'Branch B - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000007', '0002000007', '01.07.1950', '31.12.9999', 'YA0302', 'YA0302', 'YA0300', 'YA0300', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000008', '0002000008', '01.07.1950', '31.12.9999', 'YA0999', 'YA0999', 'YA0900', 'YA0900', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000009', '0002000009', '01.07.1950', '31.12.9999', 'YA0999', 'YA0999', 'YA0900', 'YA0900', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company');

When I load this data into my data warehouse I load it into a table that has a parent-child relationship like this:

CREATE SEQUENCE CostCentreID_Sequence AS integer START WITH 1 NO CYCLE NO CACHE;
CREATE TABLE CostCentre (
    CostCentreID int CONSTRAINT DF_Sequence_CostCentreID_CostCentre DEFAULT NEXT VALUE FOR CostCentreID_Sequence NOT NULL,
    CostCentreCode varchar(15) Constraint uCostCentreCode_CostCentre Unique NOT NULL,
    CostCentreDesc varchar(100) NOT NULL,
    ValidFromDate date,
    ValidToDate date,
    ParentID int CONSTRAINT fParentID_CostCentre REFERENCES CostCentre(CostCentreID),
    CONSTRAINT pCostCentreID_CostCentre PRIMARY KEY CLUSTERED (CostCentreID))
    With (Data_Compression = Row);
CREATE INDEX iParentID ON CostCentre(ParentID);

So to get to that format I have a query which grabs the distinct values of CostCentreCode and CostCentreDesc by generating a UNION of the values in each level, like this:

WITH unflattened AS (
    SELECT CCH.CostCentreCode,
        CCH.CostCentreDesc,
        CCH.ValidFromDate,
        CCH.ValidToDate,
        COALESCE(CCH.ParentCode01,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode01,
        CCH.ParentDesc01,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode02,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode02,
        CCH.ParentDesc02,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode03,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode03,
        CCH.ParentDesc03,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode04,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode04,
        CCH.ParentDesc04,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode05,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode05,
        CCH.ParentDesc05,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode06,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode06,
        CCH.ParentDesc06,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode07,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode07,
        CCH.ParentDesc07,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode08,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode08,
        CCH.ParentDesc08,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode09,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode09,
        CCH.ParentDesc09,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode10,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode10,
        CCH.ParentDesc10,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        COALESCE(CCH.ParentCode11,'') AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH
    UNION
    SELECT CCH.ParentCode11,
        CCH.ParentDesc11,
        NULL AS ValidFromDate,
        NULL AS ValidToDate,
        '' AS ParentCostCentreCode
        FROM CostCentreHierarchy AS CCH)
SELECT u.CostCentreCode,
    u.CostCentreDesc,
    u.ValidFromDate,
    u.ValidToDate,
    IIF(unflattened.ParentCostCentreCode = '', NULL, u.ParentCostCentreCode) AS ParentCostCentreCode
  FROM unflattened AS u
  WHERE u.CostCentreCode <> '';

Maybe I'm overthinking things but I don't really like this because it's currently doing 12 table scans to get the result. The optimiser is also currently choosing to sort the data and do merge joins to achieve the UNION but I'm not worried at this stage about it's choice of doing that.

Is there another way that I could do this which doesn't result in additional table scans every time the hierarchy gets one level deeper?

Best Answer

You can use a cross apply and the table value constructor.

select U.CostCentreCode,
       U.CostCentreDesc,
       C.ValidFromDate,
       C.ValidToDate,
       coalesce(U.ParentCostCentreCode, '') as ParentCostCentreCode
from dbo.CostCentreHierarchy as C
  cross apply (values(C.CostCentreCode, C.CostCentreDesc, C.ParentCode01),
                     (C.ParentCode01, C.ParentDesc01, C.ParentCode02),
                     (C.ParentCode02, C.ParentDesc02, C.ParentCode03),
                     (C.ParentCode03, C.ParentDesc03, C.ParentCode04),
                     (C.ParentCode04, C.ParentDesc04, C.ParentCode05),
                     (C.ParentCode05, C.ParentDesc05, C.ParentCode06),
                     (C.ParentCode06, C.ParentDesc06, C.ParentCode07),
                     (C.ParentCode07, C.ParentDesc07, C.ParentCode08),
                     (C.ParentCode08, C.ParentDesc08, C.ParentCode09),
                     (C.ParentCode09, C.ParentDesc09, C.ParentCode10),
                     (C.ParentCode10, C.ParentDesc10, C.ParentCode11),
                     (C.ParentCode11, C.ParentDesc11, '')
              ) as U(CostCentreCode, CostCentreDesc, ParentCostCentreCode)
where U.CostCentreCode <> '';

enter image description here

The optimiser is also currently choosing to sort the data and do merge joins to achieve the UNION

That is because union is trying to remove duplicate rows. If you don't need that you should use union all instead.

If removing duplicates is what you want then you should add a distinct the the query above.

select distinct 
       U.CostCentreCode,
       U.CostCentreDesc,
       C.ValidFromDate,
....