Sql-server – Select distinct values in column with condition

group byquerysql server

I have the products table as follows:

ProductID | ItemId | Color
---------------------------
1           X        5N
2           X        10N
3           X        2N
4           Y        12S  

and a second one joined by ItemId

ID | ItemId | Group | Color | Value
-------------------------------------
1    X        T       null    5
2    X        T       5N      7
3    X        M       2N      9
4    X        M       null    6
5    Y        M       null    12

I'd like to write a select that will return distinct Group values. As you can see there two groups with T value and ItemId is X, so in this case if color is the same in product and in other table then value with same color must be used.

If there is no such color, then take null value. What is important is that all distinct groups for each product must be fetched. The only thing is the priority for the same groups if color match.

Desired result:

ProductID | ItemId | Group | Color | Value
-------------------------------------------
1           X        T       5N      7
1           X        M       5N      6
2           X        T       10N     5
2           X        M       10N     6
3           X        T       2N      5
3           X        M       2N      9
4           Y        M       12S     12  

DDL:

create table ProductPrices
(
  Id bigint not null
    constraint pk_product_prices
      primary key,
  ItemId varchar(50) not null,
  Group varchar(10) not null,
  Value decimal(18,4) not null,
  Color varchar(50)
)
go

create table Products
(
  ProductId nvarchar(70) not null
    constraint PK_table_4
      primary key,
  ItemId nvarchar(50) not null,
  Color nvarchar(50) not null
)
go

Best Answer

Solution 1

The idea is to find distinct groups for each product, then the highest priority match calculated using a CASE expression:

SELECT
    P.ProductId,
    P.ItemId,
    G.[Group],
    P.Color,
    V.[Value]
FROM dbo.Products AS P
CROSS APPLY 
(
    -- Find groups for the current ItemId
    SELECT DISTINCT
        PP.[Group]
    FROM dbo.ProductPrices AS PP
    WHERE
        PP.ItemId = P.ItemId
) AS G
CROSS APPLY 
(
    -- Find the highest-priority Value
    SELECT TOP (1)
        PP2.[Value]
    FROM dbo.ProductPrices AS PP2
    WHERE
        PP2.ItemId = P.ItemId
        AND PP2.[Group] = G.[Group]
    ORDER BY
        CASE
            -- Colour match is best
            WHEN PP2.Color = P.Color THEN 1
            -- Otherwise accept NULL
            WHEN PP2.Color IS NULL THEN 2
            ELSE 3
        END ASC
) AS V
ORDER BY 
    P.ProductId,
    P.ItemId,
    G.[Group] DESC;

Solution 2

This joins the tables in the natural way, then uses a window function to find the highest priority match for each group. This code only accesses each base table once:

WITH
    Joined AS
    (
        SELECT
            P.ProductId,
            P.ItemId,
            PP.[Group],
            Color = ISNULL(PP.Color, P.Color),
            PP.[Value],
            rnk = 
                CASE
                    WHEN PP.Color = P.Color THEN 1
                    WHEN PP.Color IS NULL THEN 2
                    ELSE 3
                END
        FROM dbo.Products AS P
        JOIN dbo.ProductPrices AS PP
            ON PP.ItemId = P.ItemId
    ),
    Ranked AS
    (
        SELECT
            *,
            rn = 
                ROW_NUMBER() OVER (
                    PARTITION BY Joined.ItemId, Joined.[Group], Joined.Color
                    ORDER BY Joined.rnk ASC)
        FROM Joined
    )
SELECT
    R.ProductId,
    R.ItemId,
    R.[Group],
    R.Color,
    R.[Value]
FROM Ranked AS R
WHERE 
    R.rn = 1
ORDER BY 
    R.ProductId,
    R.ItemId,
    R.[Group] DESC;

Solution 3

This is the same logic as solution 2, but uses a subquery and a more compact syntax:

SELECT
    Q1.ProductId,
    Q1.ItemId,
    Q1.[Group],
    Q1.Color,
    Q1.[Value]
FROM 
(
    SELECT
        P.ProductId,
        P.ItemId,
        PP.[Group],
        Color = ISNULL(PP.Color, P.Color),
        PP.Value,
        rn = ROW_NUMBER() OVER (
            PARTITION BY P.ItemId, PP.[Group], ISNULL(PP.Color, P.Color)
            ORDER BY
                CASE
                    WHEN PP.Color = P.Color THEN 1
                    WHEN PP.Color IS NULL THEN 2
                    ELSE 3
                END)
    FROM dbo.Products AS P
    JOIN dbo.ProductPrices AS PP
        ON PP.ItemId = P.ItemId
) AS Q1
WHERE
    Q1.rn = 1
ORDER BY
    Q1.ProductId,
    Q1.ItemId,
    Q1.[Group] DESC;

Solution 4

With the following index on dbo.ProductPrices:

CREATE INDEX [IX dbo.ProductPrices ItemId, Group (Color)]
ON dbo.ProductPrices
(
    ItemId,
    [Group],
    Color
)
INCLUDE
(
    [Value]
);

The following solution can perform well:

SELECT
    P.ProductId,
    P.ItemId,
    G.[Group],
    P.Color,
    [Value] =
        IIF
        (
            EXISTS
            (
                SELECT
                    1
                FROM dbo.ProductPrices AS PP
                WHERE
                    PP.ItemId = CONVERT(varchar(50), P.ItemId)
                    AND PP.[Group] = G.[Group]
                    AND PP.Color = CONVERT(varchar(50), P.Color)
            ),
            (
                SELECT TOP (1)
                    PP.[Value]
                FROM dbo.ProductPrices AS PP
                WHERE
                    PP.ItemId = CONVERT(varchar(50), P.ItemId)
                    AND PP.[Group] = G.[Group]
                    AND PP.Color = CONVERT(varchar(50), P.Color)
            ),
            (
                SELECT TOP (1)
                    PP.[Value]
                FROM dbo.ProductPrices AS PP
                WHERE
                    PP.ItemId = CONVERT(varchar(50), P.ItemId)
                    AND PP.[Group] = G.[Group]
                    AND PP.Color IS NULL
            )
        )
FROM dbo.Products AS P
CROSS APPLY 
(
    SELECT DISTINCT TOP (9223372036854775807)
        PP.ItemId,
        PP.[Group]
    FROM dbo.ProductPrices AS PP
    WHERE
        PP.ItemId = CONVERT(varchar(50), P.ItemId)
) AS G
ORDER BY 
    P.ProductId,
    P.ItemId,
    G.[Group] DESC;

Demos

db<>fiddle demo

As a side note, ItemId is defined as nvarchar(50) in one table, and varchar(50) in the other. Same with the Color column. These problems should be fixed, if possible.