I have some tables in the staging area of a data warehouse that I'm filling with data from some flat, comma-delimited text extracts from another system. When the data comes in the hierarchy of parents for each element is presented in columns labelled ParentCode01
… ParentCode11
where the immediate parent of the current node is in ParentCode01
and the top level parent could be in any column (ParentCode11
is mostly NULL
).
CREATE TABLE CostCentreHierarchy (
CostCentreCode varchar(10) NOT NULL CONSTRAINT pCostCentreCode_CostCentreHierarchy PRIMARY KEY,
CostCentreDesc varchar(100),
ValidFromDate varchar(10),
ValidToDate varchar(10),
ParentCode01 varchar(15),
ParentDesc01 varchar(100),
ParentCode02 varchar(15),
ParentDesc02 varchar(100),
ParentCode03 varchar(15),
ParentDesc03 varchar(100),
ParentCode04 varchar(15),
ParentDesc04 varchar(100),
ParentCode05 varchar(15),
ParentDesc05 varchar(100),
ParentCode06 varchar(15),
ParentDesc06 varchar(100),
ParentCode07 varchar(15),
ParentDesc07 varchar(100),
ParentCode08 varchar(15),
ParentDesc08 varchar(100),
ParentCode09 varchar(15),
ParentDesc09 varchar(100),
ParentCode10 varchar(15),
ParentDesc10 varchar(100),
ParentCode11 varchar(15),
ParentDesc11 varchar(100));
INSERT INTO CostCentreHierarchy
(CostCentreCode, CostCentreDesc, ValidFromDate, ValidToDate, ParentCode01, ParentDesc01, ParentCode02, ParentDesc02,
ParentCode03, ParentDesc03, ParentCode04, ParentDesc04, ParentCode05, ParentDesc05, ParentCode06, ParentDesc06)
VALUES
('0002000000', '0002000000', '01.07.1950', '31.12.9999', 'YA0201', 'YA0201', 'YA0200', 'YA0200', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000001', '0002000001', '01.07.1950', '31.12.9999', 'YA0301', 'YA0301', 'YA0300', 'YA0300', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000002', '0002000002', '01.07.1950', '31.12.9999', 'XA0101', 'XA0101', 'XA0100', 'XA0100', 'XA0000', 'Unit 3 - Admin', 'X00000', 'Branch B - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000003', '0002000003', '01.07.1950', '31.12.9999', 'YA0999', 'YA0999', 'YA0900', 'YA0900', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000004', '0002000004', '01.07.1950', '31.12.9999', 'YB0999', 'YB0999', 'YB0900', 'YB0900', 'YB0000', 'Unit 2 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000005', '0002000005', '01.07.1950', '31.12.9999', 'YA0101', 'YA0101', 'YA0100', 'YA0100', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000006', '0002000006', '01.07.1950', '31.12.9999', 'XA0999', 'XA0999', 'XA0900', 'XA0900', 'XA0000', 'Unit 3 - Admin', 'X00000', 'Branch B - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000007', '0002000007', '01.07.1950', '31.12.9999', 'YA0302', 'YA0302', 'YA0300', 'YA0300', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000008', '0002000008', '01.07.1950', '31.12.9999', 'YA0999', 'YA0999', 'YA0900', 'YA0900', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company'),
('0002000009', '0002000009', '01.07.1950', '31.12.9999', 'YA0999', 'YA0999', 'YA0900', 'YA0900', 'YA0000', 'Unit 1 - Admin', 'Y00000', 'Branch A - Admin', '1A', 'ADMINISTERED REPORTING', '1', 'Company');
When I load this data into my data warehouse I load it into a table that has a parent-child relationship like this:
CREATE SEQUENCE CostCentreID_Sequence AS integer START WITH 1 NO CYCLE NO CACHE;
CREATE TABLE CostCentre (
CostCentreID int CONSTRAINT DF_Sequence_CostCentreID_CostCentre DEFAULT NEXT VALUE FOR CostCentreID_Sequence NOT NULL,
CostCentreCode varchar(15) Constraint uCostCentreCode_CostCentre Unique NOT NULL,
CostCentreDesc varchar(100) NOT NULL,
ValidFromDate date,
ValidToDate date,
ParentID int CONSTRAINT fParentID_CostCentre REFERENCES CostCentre(CostCentreID),
CONSTRAINT pCostCentreID_CostCentre PRIMARY KEY CLUSTERED (CostCentreID))
With (Data_Compression = Row);
CREATE INDEX iParentID ON CostCentre(ParentID);
So to get to that format I have a query which grabs the distinct values of CostCentreCode
and CostCentreDesc
by generating a UNION
of the values in each level, like this:
WITH unflattened AS (
SELECT CCH.CostCentreCode,
CCH.CostCentreDesc,
CCH.ValidFromDate,
CCH.ValidToDate,
COALESCE(CCH.ParentCode01,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode01,
CCH.ParentDesc01,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode02,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode02,
CCH.ParentDesc02,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode03,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode03,
CCH.ParentDesc03,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode04,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode04,
CCH.ParentDesc04,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode05,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode05,
CCH.ParentDesc05,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode06,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode06,
CCH.ParentDesc06,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode07,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode07,
CCH.ParentDesc07,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode08,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode08,
CCH.ParentDesc08,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode09,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode09,
CCH.ParentDesc09,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode10,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode10,
CCH.ParentDesc10,
NULL AS ValidFromDate,
NULL AS ValidToDate,
COALESCE(CCH.ParentCode11,'') AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH
UNION
SELECT CCH.ParentCode11,
CCH.ParentDesc11,
NULL AS ValidFromDate,
NULL AS ValidToDate,
'' AS ParentCostCentreCode
FROM CostCentreHierarchy AS CCH)
SELECT u.CostCentreCode,
u.CostCentreDesc,
u.ValidFromDate,
u.ValidToDate,
IIF(unflattened.ParentCostCentreCode = '', NULL, u.ParentCostCentreCode) AS ParentCostCentreCode
FROM unflattened AS u
WHERE u.CostCentreCode <> '';
Maybe I'm overthinking things but I don't really like this because it's currently doing 12 table scans to get the result. The optimiser is also currently choosing to sort the data and do merge joins to achieve the UNION
but I'm not worried at this stage about it's choice of doing that.
Is there another way that I could do this which doesn't result in additional table scans every time the hierarchy gets one level deeper?
Best Answer
You can use a cross apply and the table value constructor.
That is because union is trying to remove duplicate rows. If you don't need that you should use
union all
instead.If removing duplicates is what you want then you should add a
distinct
the the query above.