Sql-server – Evaluating most frequent values of columns in a single query

sql serversql-server-2008

I have a table describing my application users with some details such as name, surname, birth date, nationality, email, etc…

I would like to known for each property, and for each user category, the most frequent value and the percentage of occurence.

For instance :

create table test ( userId int identity(1,1), 
                    categoryId int, 
                    name varchar(50), 
                    surname varchar(50))

insert into test(categoryId, name, surname)
values  (1, 'John', 'Locke'),
        (1, 'John', 'Millar'),
        (1, 'James', 'Mill'),
        (1, 'John Stuart', 'Mill'),
        (2, 'Thomas', 'Bayes'),
        (2, 'Laurent', 'Schwartz'),
        (2, 'Herrmann Amandus', 'Schwartz'),
        (2, 'Thomas', 'Simpson'),
        (2, 'Leonhard', 'Euler')

Result should be:

+------------+-------+--------+---------+----------+------------+
| categoryId | total |  name  | namePct | surname  | surnamePct |
+------------+-------+--------+---------+----------+------------+
|          1 |     4 | John   |    0.50 | Mill     |       0.50 |
|          2 |     5 | Thomas |    0.40 | Schwartz |       0.40 |
+------------+-------+--------+---------+----------+------------+

For this simple example, I can figure how I can achieve that, with a query like:

select  t.categoryId, 
        t.total, 
        n.name, 
        1. * n.total / t.total as namePct,
        sn.surname,
        1. * sn.total / t.total as surnamePct
from (
    select categoryId, count(*) as total
    from test
    group by categoryId
    ) t
join (
        select categoryId, name, total
        from (
            select categoryId, name, total, row_number() over(partition by categoryId order by total desc) as rn
            from (
                select categoryId, name, count(*) as total
                from test
                group by categoryId, name
                ) t
            ) t
        where rn = 1
        ) n on t.categoryId = n.categoryId
join (
        select categoryId, surname, total
        from (
            select categoryId, surname, total, row_number() over(partition by categoryId order by total desc) as rn
            from (
                select categoryId, surname, count(*) as total
                from test
                group by categoryId, surname
                ) t
            ) t
        where rn = 1
        ) sn on t.categoryId = sn.categoryId

But, in my real use case my table has millions of rows, hundreds of categories and a dozen of attributes.

Is there a way to make the query simpler and more efficient (ie. without a bunch of sub select for each attribute) ?

I'm currently using SQL Server 2008, but answers using more recent versions are welcome.

Best Answer

You can use the windowed version of function COUNT(), split by each category with PARTITION BY to get counts and totals without a subquery (note the lack of GROUP BY):

SELECT
    T.categoryId,

    T.name,
    NameOccurencies = COUNT(T.name) OVER (PARTITION BY T.categoryId, T.name),
    NameTotals = COUNT(T.name) OVER (PARTITION BY T.categoryId),

    T.surname,
    SurnameOccurencies = COUNT(T.surname) OVER (PARTITION BY T.categoryId, T.surname),
    SurnameTotals = COUNT(T.surname) OVER (PARTITION BY T.categoryId)
FROM
    #test AS T

Result:

categoryId  name                NameOccurencies NameTotals  surname             SurnameOccurencies  SurnameTotals
1           John                2               4           Locke               1                   4
1           John Stuart         1               4           Mill                2                   4
1           James               1               4           Mill                2                   4
1           John                2               4           Millar              1                   4
2           Thomas              2               5           Bayes               1                   5
2           Leonhard            1               5           Euler               1                   5
2           Herrmann Amandus    1               5           Schwartz            2                   5
2           Laurent             1               5           Schwartz            2                   5
2           Thomas              2               5           Simpson             1                   5

Then you can use this result to get each percentage simply dividing the occurencies by each total. You can also calculate the best (most frequent) name and surname using a ROW_NUMBER() on this step:

;WITH Totals AS
(
    SELECT
        T.categoryId,

        T.name,
        NameOccurencies = COUNT(T.name) OVER (PARTITION BY T.categoryId, T.name),
        NameTotals = COUNT(T.name) OVER (PARTITION BY T.categoryId),

        T.surname,
        SurnameOccurencies = COUNT(T.surname) OVER (PARTITION BY T.categoryId, T.surname),
        SurnameTotals = COUNT(T.surname) OVER (PARTITION BY T.categoryId)
    FROM
        #test AS T
)
SELECT
    T.categoryId,

    T.name,
    NamePercentage = T.NameOccurencies * 1.0 / NULLIF(T.NameTotals, 0),
    NameMostFrequentRanking = ROW_NUMBER() OVER (
        PARTITION BY
            T.categoryId
        ORDER BY 
            T.NameOccurencies * 1.0 / NULLIF(T.NameTotals, 0) DESC), -- NamePercentage

    T.surname,
    SurnamePercentage = T.SurnameOccurencies * 1.0 / NULLIF(T.SurnameTotals, 0),
    SurnameMostFrequentRanking = ROW_NUMBER() OVER (
        PARTITION BY
            T.categoryId
        ORDER BY 
            T.SurnameOccurencies * 1.0 / NULLIF(T.SurnameTotals, 0) DESC) -- SurnamePercentage
FROM
    Totals AS T

Result:

categoryId  name                NamePercentage  NameMostFrequentRanking surname     SurnamePercentage   SurnameMostFrequentRanking
1           John Stuart         0.250000000000  3                       Mill        0.500000000000      1
1           James               0.250000000000  4                       Mill        0.500000000000      2
1           John                0.500000000000  1                       Millar      0.250000000000      3
1           John                0.500000000000  2                       Locke       0.250000000000      4
2           Herrmann Amandus    0.200000000000  4                       Schwartz    0.400000000000      1
2           Laurent             0.200000000000  5                       Schwartz    0.400000000000      2
2           Thomas              0.400000000000  1                       Simpson     0.200000000000      3
2           Thomas              0.400000000000  2                       Bayes       0.200000000000      4
2           Leonhard            0.200000000000  3                       Euler       0.200000000000      5

Finally, for each available category...

SELECT
    T.categoryId,
    TotalRecords = COUNT(1)
FROM
    #test AS T
GROUP BY
    T.categoryId

we can get the most frequent name and surname with their percentages with a few joins:

;WITH Totals AS
(
    SELECT
        T.categoryId,

        T.name,
        NameOccurencies = COUNT(T.name) OVER (PARTITION BY T.categoryId, T.name),
        NameTotals = COUNT(T.name) OVER (PARTITION BY T.categoryId),

        T.surname,
        SurnameOccurencies = COUNT(T.surname) OVER (PARTITION BY T.categoryId, T.surname),
        SurnameTotals = COUNT(T.surname) OVER (PARTITION BY T.categoryId)
    FROM
        #test AS T
),
MostFrequentRanking AS
(
    SELECT
        T.categoryId,

        T.name,
        NamePercentage = T.NameOccurencies * 1.0 / NULLIF(T.NameTotals, 0),
        NameMostFrequentRanking = ROW_NUMBER() OVER (
            PARTITION BY
                T.categoryId
            ORDER BY 
                T.NameOccurencies * 1.0 / NULLIF(T.NameTotals, 0) DESC),

        T.surname,
        SurnamePercentage = T.SurnameOccurencies * 1.0 / NULLIF(T.SurnameTotals, 0),
        SurnameMostFrequentRanking = ROW_NUMBER() OVER (
            PARTITION BY
                T.categoryId
            ORDER BY 
                T.SurnameOccurencies * 1.0 / NULLIF(T.SurnameTotals, 0) DESC)
    FROM
        Totals AS T
),
AvailableCategories AS
(
    SELECT
        T.categoryId,
        TotalRecords = COUNT(1)
    FROM
        #test AS T
    GROUP BY
        T.categoryId
)
SELECT
    A.categoryId,
    A.TotalRecords,
    MN.name,
    NamePercentage = CONVERT(DECIMAL(3, 2), MN.NamePercentage),
    MS.surname,
    SurnamePercentage = CONVERT(DECIMAL(3, 2), MS.SurnamePercentage)
FROM
    AvailableCategories AS A
    LEFT JOIN MostFrequentRanking AS MN ON 
        A.categoryId = MN.categoryId AND
        MN.NameMostFrequentRanking = 1
    LEFT JOIN MostFrequentRanking AS MS ON 
        A.categoryId = MS.categoryId AND
        MS.SurnameMostFrequentRanking = 1

Result:

categoryId  TotalRecords    name    NamePercentage  surname     SurnamePercentage
1           4               John    0.50            Mill        0.50
2           5               Thomas  0.40            Schwartz    0.40

It might be a little big, but you can edit this query with as many new columns you want without adding new SELECT, just repeating the same logic with each new column you want to display and an additional join at the end.

If you have millions of records and query takes long, you might want split each CTE into a temporary table with SELECT ... INTO + CREATE INDEX by categoryId to speed up the process (if you are willing to spend a few resources by creating these tables)