MySQL – How to Delete All Records Except Latest 10 for Each User

greatest-n-per-groupMySQL

I have an activities table that has a foreign key reference to a users table. A user can have many activities, and I have a job that will populate their activities on a daily basis. After that job runs, I want to run a SQL query that will delete the oldest records per user and keep the latest 10. This is the schema:

users

id name
1  Jimmy
2  Johnny

user_activities

id user_id activity date
1  1       foo      2020-08-07 00:00:00
2  1       bar      2020-08-08 00:00:00
3  1       baz      2020-08-09 00:00:00
4  2       foo      2020-08-07 00:00:00
5  2       bar      2020-08-08 00:00:00
6  2       baz      2020-08-09 00:00:00

How can I write a query to truncate records up to the latest 10 for each user?

Best Answer

This works with mysql 5.x and abouve

mysql has since version the window function ROW_NUMBER so that the subquery can be written without user defined variables

I added some data to show that it works

The idea is simple, sort all dates by user_id and keep the 10 newest, the rownumber defines the position of the record sorted by date descending

CREATE TABLE user_date
    (`id` int, `user_id` int, `activity` varchar(3), `date` varchar(19))
;
    
INSERT INTO user_date
    (`id`, `user_id`, `activity`, `date`)
VALUES
    (1, 1, 'foo', '2020-08-07 00:00:00'),
    (2, 1, 'bar', '2020-08-08 00:00:00'),
    (3, 1, 'baz', '2020-08-09 00:00:00'),
    (4, 2, 'foo', '2020-08-07 00:00:00'),
    (5, 2, 'bar', '2020-08-08 00:00:00'),
    (6, 2, 'baz', '2020-08-09 00:00:00')
    ,
    (7, 2, 'baz', '2020-08-10 00:00:00'),
    (8, 2, 'baz', '2020-08-10 00:00:00'),
    (9, 2, 'baz', '2020-08-10 00:00:00'),
    (10, 2, 'baz', '2020-08-10 00:00:00'),
    (11, 2, 'baz', '2020-08-10 00:00:00'),
    (12, 2, 'baz', '2020-08-10 00:00:00'),
    (13, 2, 'baz', '2020-08-10 00:00:00'),
    (14, 2, 'baz', '2020-08-10 00:00:00'),
    (15, 2, 'baz', '2020-08-10 00:00:00'),
    (16, 2, 'baz', '2020-08-10 00:00:00')
;
DELETE u1
FROM
       user_date AS u1
   JOIN
(
SELECT  id
  FROM
     (
      SELECT  @row_num := IF(@prev_value=o.user_id,@row_num+1,1) AS 'RowNumber'
             ,o.id
             ,@prev_value := o.user_id
        FROM user_date o,
             (SELECT @row_num := 1) x,
             (SELECT @prev_value := '') y
       ORDER BY o.user_id, o.`date` DESC
     ) subquery
 WHERE `RowNumber` > 10 ) u2 ON u1.id = u2.id  
 
SELECT * FROM user_date
id | user_id | activity | date               
-: | ------: | :------- | :------------------
 1 |       1 | foo      | 2020-08-07 00:00:00
 2 |       1 | bar      | 2020-08-08 00:00:00
 3 |       1 | baz      | 2020-08-09 00:00:00
 7 |       2 | baz      | 2020-08-10 00:00:00
 8 |       2 | baz      | 2020-08-10 00:00:00
 9 |       2 | baz      | 2020-08-10 00:00:00
10 |       2 | baz      | 2020-08-10 00:00:00
11 |       2 | baz      | 2020-08-10 00:00:00
12 |       2 | baz      | 2020-08-10 00:00:00
13 |       2 | baz      | 2020-08-10 00:00:00
14 |       2 | baz      | 2020-08-10 00:00:00
15 |       2 | baz      | 2020-08-10 00:00:00
16 |       2 | baz      | 2020-08-10 00:00:00

db<>fiddle here

For Sql Server

CREATE TABLE user_date
    (id int, user_id int, activity varchar(3), date varchar(19))
;
    
INSERT INTO user_date
    (id, user_id, activity, date)
VALUES
    (1, 1, 'foo', '2020-08-07 00:00:00'),
    (2, 1, 'bar', '2020-08-08 00:00:00'),
    (3, 1, 'baz', '2020-08-09 00:00:00'),
    (4, 2, 'foo', '2020-08-07 00:00:00'),
    (5, 2, 'bar', '2020-08-08 00:00:00'),
    (6, 2, 'baz', '2020-08-09 00:00:00')
    ,
    (7, 2, 'baz', '2020-08-10 00:00:00'),
    (8, 2, 'baz', '2020-08-10 00:00:00'),
    (9, 2, 'baz', '2020-08-10 00:00:00'),
    (10, 2, 'baz', '2020-08-10 00:00:00'),
    (11, 2, 'baz', '2020-08-10 00:00:00'),
    (12, 2, 'baz', '2020-08-10 00:00:00'),
    (13, 2, 'baz', '2020-08-10 00:00:00'),
    (14, 2, 'baz', '2020-08-10 00:00:00'),
    (15, 2, 'baz', '2020-08-10 00:00:00'),
    (16, 2, 'baz', '2020-08-10 00:00:00')
;
GO
DELETE u1
FROM
       user_date AS u1
   JOIN
(
SELECT  id
  FROM
     (
      SELECT    ROW_NUMBER() OVER(PARTITION BY o.user_id ORDER BY o.[date] DESC) 
                    AS RowNumber
             ,o.id
             ,o.user_id
        FROM user_date o
     ) subquery
 WHERE RowNumber > 10 ) u2 ON u1.id = u2.id  
 
GO

3 rows affected

SELECT * FROM user_date
GO
id | user_id | activity | date               
-: | ------: | :------- | :------------------
 1 |       1 | foo      | 2020-08-07 00:00:00
 2 |       1 | bar      | 2020-08-08 00:00:00
 3 |       1 | baz      | 2020-08-09 00:00:00
 7 |       2 | baz      | 2020-08-10 00:00:00
 8 |       2 | baz      | 2020-08-10 00:00:00
 9 |       2 | baz      | 2020-08-10 00:00:00
10 |       2 | baz      | 2020-08-10 00:00:00
11 |       2 | baz      | 2020-08-10 00:00:00
12 |       2 | baz      | 2020-08-10 00:00:00
13 |       2 | baz      | 2020-08-10 00:00:00
14 |       2 | baz      | 2020-08-10 00:00:00
15 |       2 | baz      | 2020-08-10 00:00:00
16 |       2 | baz      | 2020-08-10 00:00:00

db<>fiddle here