How to Compare One Column Between Two Rows in MySQL

MySQL

I am trying to create a report where employees pick up a task from the queue before they have completed the previous task. The tabular representation of it as shown below is each task has a job number, and employee have employee id. Each job has a task duration, that is when it was created and completed. It also has an assign time when the respective employee picked up the job. So the data looks something like below

+----+------------+--------+---------------------+---------------------+---------------------+
| id | job_number | emp_id | created_at          | assigned_at         | completed_at        |
+----+------------+--------+---------------------+---------------------+---------------------+
|  1 |          1 |      1 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|  2 |          2 |      1 | 2019-06-28 01:30:00 | 2019-06-28 01:45:00 | 2019-06-28 02:30:00 |
|  3 |          3 |      1 | 2019-06-28 02:00:00 | 2019-06-28 02:15:00 | 2019-06-28 03:00:00 |
|  4 |          4 |      1 | 2019-06-28 05:00:00 | 2019-06-28 05:15:00 | 2019-06-28 06:00:00 |
|  5 |          5 |      2 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|  6 |          6 |      2 | 2019-06-28 01:30:00 | 2019-06-28 01:45:00 | 2019-06-28 02:30:00 |
|  7 |          7 |      2 | 2019-06-28 03:00:00 | 2019-06-28 03:05:00 | 2019-06-28 03:30:00 |
|  8 |          8 |      3 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|  9 |          9 |      3 | 2019-06-28 02:00:00 | 2019-06-28 02:15:00 | 2019-06-28 03:00:00 |
+----+------------+--------+---------------------+---------------------+---------------------+

The definition of simultaneously picked up job, is when an employee has an assigned at time, smaller than completed at and larger than created at of a job that the same employee is doing. So for the above table the result should look something like below.

+------------+--------+---------------------+---------------------+---------------------+
| job_number | emp_id | created_at          | assigned_at         | completed_at        |
+------------+--------+---------------------+---------------------+---------------------+
|          1 |      1 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|          2 |      1 | 2019-06-28 01:30:00 | 2019-06-28 01:45:00 | 2019-06-28 02:30:00 |
|          3 |      1 | 2019-06-28 02:00:00 | 2019-06-28 02:15:00 | 2019-06-28 03:00:00 |
|          5 |      2 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|          6 |      2 | 2019-06-28 01:30:00 | 2019-06-28 01:45:00 | 2019-06-28 02:30:00 |
+------------+--------+---------------------+---------------------+---------------------+

So employee 1 picked up job 2 while job 1 was still in progress and then job 3 while job 2 was in progress. Similarly employee 2 picked up job 6 while job 5 was in progress.

Do note that employee 5 has no over lapping jobs, and jobs 4 and 7 do not overlap.

How do I write an efficient query to get the above result. I have solution, but not a SQL person, so do not think the solution is optimized.

Following is my attempt for any one to follow

-- create a new test schema
create schema test;

use  test;

-- create the test table
create table if not exists test_jobs (
    id int not null auto_increment primary key
  , job_number int not null default 0
  , emp_id int not null default 0
  , created_at datetime not null default current_timestamp
  , assigned_at datetime not null default current_timestamp
  , completed_at datetime not null default current_timestamp
) engine=innodb character set=utf8;

-- seed test data
insert into test_jobs
(job_number, emp_id, created_at, assigned_at, completed_at)
values
(1, 1, '2019-06-28 01:00:00', '2019-06-28 01:15:00', '2019-06-28 02:00:00'),
(2, 1, '2019-06-28 01:30:00', '2019-06-28 01:45:00', '2019-06-28 02:30:00'),
(3, 1, '2019-06-28 02:00:00', '2019-06-28 02:15:00', '2019-06-28 03:00:00'),
(4, 1, '2019-06-28 05:00:00', '2019-06-28 05:15:00', '2019-06-28 06:00:00'),
(5, 2, '2019-06-28 01:00:00', '2019-06-28 01:15:00', '2019-06-28 02:00:00'),
(6, 2, '2019-06-28 01:30:00', '2019-06-28 01:45:00', '2019-06-28 02:30:00'),
(7, 2, '2019-06-28 03:00:00', '2019-06-28 03:05:00', '2019-06-28 03:30:00'),
(8, 3, '2019-06-28 01:00:00', '2019-06-28 01:15:00', '2019-06-28 02:00:00'),
(9, 3, '2019-06-28 02:00:00', '2019-06-28 02:15:00', '2019-06-28 03:00:00');

-- make sure test data is properly seeded
select * from test_jobs;

-- Get the jobs are were started in between another job
create temporary table test_jobs_1
select
    tj1.job_number
  , tj1.emp_id
  , tj1.created_at
  , tj1.assigned_at
  , tj1.completed_at
from
  test_jobs tj1 inner join test_jobs tj2
  on tj1.emp_id = tj2.emp_id 
  where tj1.job_number <> tj2.job_number
  and tj1.assigned_at between tj2.created_at and tj2.completed_at;

-- make sure you get the right data
select * from test_jobs_1;
+------------+--------+---------------------+---------------------+---------------------+
| job_number | emp_id | created_at          | assigned_at         | completed_at        |
+------------+--------+---------------------+---------------------+---------------------+
|          2 |      1 | 2019-06-28 01:30:00 | 2019-06-28 01:45:00 | 2019-06-28 02:30:00 |
|          3 |      1 | 2019-06-28 02:00:00 | 2019-06-28 02:15:00 | 2019-06-28 03:00:00 |
|          6 |      2 | 2019-06-28 01:30:00 | 2019-06-28 01:45:00 | 2019-06-28 02:30:00 |
+------------+--------+---------------------+---------------------+---------------------+

-- create copy of this temporary table to get around pesky issue of not being able to reopen temporary table
create temporary table temp_test_jobs select * from test_jobs_1;

-- Now create another table which trims data to only employees who can have did more than
-- one job simultaneously and the jobs that were not included in previous basically minus 
-- set of test_jobs_1 but only for the selected employees
create temporary table test_jobs_2
select
    tj.job_number
  , tj.emp_id
  , tj.created_at
  , tj.assigned_at
  , tj.completed_at
from test_jobs tj 
where tj.job_number not in (select distinct(job_number) from test_jobs_1)
and tj.emp_id in (select distinct(emp_id) from temp_test_jobs);

-- make sure you have correct test data
select * from test_jobs_2;

+------------+--------+---------------------+---------------------+---------------------+
| job_number | emp_id | created_at          | assigned_at         | completed_at        |
+------------+--------+---------------------+---------------------+---------------------+
|          1 |      1 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|          4 |      1 | 2019-06-28 05:00:00 | 2019-06-28 05:15:00 | 2019-06-28 06:00:00 |
|          5 |      2 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|          7 |      2 | 2019-06-28 03:00:00 | 2019-06-28 03:05:00 | 2019-06-28 03:30:00 |
+------------+--------+---------------------+---------------------+---------------------+

-- Now out of that minus set called test_jobs_2 find  out who were the root jobs
-- or those that got failed to include in first one
create temporary table test_jobs_3
select 
    tj2.job_number
  , tj2.emp_id
  , tj2.created_at
  , tj2.assigned_at
  , tj2.completed_at
from test_jobs_2 tj2 inner join test_jobs_1 tj1 on tj2.emp_id = tj1.emp_id
where tj2.created_at < tj1.assigned_at and tj2.completed_at > tj1.assigned_at;

-- make sure you got the correct data
select * from test_jobs_3;
+------------+--------+---------------------+---------------------+---------------------+
| job_number | emp_id | created_at          | assigned_at         | completed_at        |
+------------+--------+---------------------+---------------------+---------------------+
|          1 |      1 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|          5 |      2 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
+------------+--------+---------------------+---------------------+---------------------+

-- Final result 
select tj.job_number, tj.emp_id, tj.created_at, tj.assigned_at, tj.completed_at
from (select * from test_jobs_1 union select * from test_jobs_3) tj order by tj.job_number;

+------------+--------+---------------------+---------------------+---------------------+
| job_number | emp_id | created_at          | assigned_at         | completed_at        |
+------------+--------+---------------------+---------------------+---------------------+
|          1 |      1 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|          2 |      1 | 2019-06-28 01:30:00 | 2019-06-28 01:45:00 | 2019-06-28 02:30:00 |
|          3 |      1 | 2019-06-28 02:00:00 | 2019-06-28 02:15:00 | 2019-06-28 03:00:00 |
|          5 |      2 | 2019-06-28 01:00:00 | 2019-06-28 01:15:00 | 2019-06-28 02:00:00 |
|          6 |      2 | 2019-06-28 01:30:00 | 2019-06-28 01:45:00 | 2019-06-28 02:30:00 |
+------------+--------+---------------------+---------------------+---------------------+

Can this be done in lesser number of queries or optimized better. This is just example dataset and data is larger in my actual data set.

Any suggestions welcome.
Thank you.

Best Answer

You can use a series of chained CTEs (or Common Table Expressions). In many ways, a CTE is very similar to a temporary table or perhaps a temporary view that lasts for the duration of the query - they are really, really good at simplifying complex logic of the sort that your SQL requires.

It becomes a convenient way to manage complicated queries.

and:

Why Do you need CTE’s? There are several reasons why you may want to use a CTE over other methods. Some of them include:

Readability – CTE’s promote readability. Rather than lump all you query logic into one large query, create several CTE’s, which are the combined later in the statement. This lets you get the chunks of data you need and combine them in a final SELECT. Substitute for a View – You can substitute a CTE for a view.

This is handy if you don’t have permissions to create a view object or you don’t want to create one as it is only used in this one query. Recursion – Use CTE’s do create recursive queries, that is queries that can call themselves. This is handy when you need to work on hierarchical data such as organization charts.

Limitations – Overcome SELECT statement limitations, such as referencing itself (recursion), or performing GROUP BY using non-deterministic functions. Ranking – Whenever you want to use ranking function such as ROW_NUMBER(), RANK(), NTILE() etc.

In the same vein, from here:

How are CTEs helpful? CTEs, like database views and derived tables, enable users to more easily write and maintain complex queries via increased readability and simplification. This reduction in complexity is achieved by deconstructing ordinarily complex queries into simple blocks to be used, and reused if necessary, in rewriting the query. Example use cases include:

Needing to reference a derived table multiple times in a single query An alternative to creating a view in the database Performing the same calculation multiple times over across multiple query components

From here, you have:

Common Table Expressions, or CTE for short, is simply a technique to create a temporary set of records that can be referenced within an INSERT, SELECT, UPDATE or DELETE statement.

Well, you get the idea! You're obviously very competent at SQL, so you should have no problems! :-)

There is another aspect to CTEs only touched upon above - that is self-referential CTEs, better known as RECURSIVE CTEs - see here and here. If you're a (Computer Language/Computer Science) nerd - they turn SQL into a Turing Complete language - see here for a link to an SQL Mandelbrot set! Well worth getting to grips with these concepts.

Phew... now, after all that, to answer the actual question. As I said at the beginning, using a chained set of CTEs solves the problem in much the same way as you "chained" your temporary tables. The DDL and DML at the bottom of this answer and also in the fiddle here.

The SQL is rather long and identical - basically just rearranged - to the SQL in the question itself, so I'll put the result first (same as your one, of course):

job_number  emp_id           created_at         assigned_at        completed_at
_______________________________________________________________________________      
         1       1  2019-06-28 01:00:00 2019-06-28 01:15:00 2019-06-28 02:00:00
         2       1  2019-06-28 01:30:00 2019-06-28 01:45:00 2019-06-28 02:30:00
         3       1  2019-06-28 02:00:00 2019-06-28 02:15:00 2019-06-28 03:00:00
         5       2  2019-06-28 01:00:00 2019-06-28 01:15:00 2019-06-28 02:00:00
         6       2  2019-06-28 01:30:00 2019-06-28 01:45:00 2019-06-28 02:30:00

The consolidated SQL using chained CTEs.

WITH cte1 AS
(
SELECT
    tj1.job_number
  , tj1.emp_id
  , tj1.created_at
  , tj1.assigned_at
  , tj1.completed_at
FROM
  test_jobs tj1 INNER JOIN test_jobs tj2
    ON tj1.emp_id = tj2.emp_id 
  WHERE tj1.job_number <> tj2.job_number
  AND tj1.assigned_at between tj2.created_at AND tj2.completed_at
),
cte2 AS
(
  SELECT
    tj.job_number
    , tj.emp_id
    , tj.created_at
    , tj.assigned_at
    , tj.completed_at
  FROM test_jobs tj 
  WHERE tj.job_number NOT IN (SELECT DISTINCT (job_number) FROM cte1)
  AND tj.emp_id IN (SELECT DISTINCT (emp_id) FROM cte1)
),
cte3 AS
(
  SELECT 
    tj2.job_number
    , tj2.emp_id
    , tj2.created_at
    , tj2.assigned_at
    , tj2.completed_at
  FROM cte2 tj2 INNER JOIN cte1 tj1 ON tj2.emp_id = tj1.emp_id
  WHERE tj2.created_at < tj1.assigned_at 
  AND tj2.completed_at > tj1.assigned_at
)
SELECT
  tj.job_number, 
  tj.emp_id, 
  tj.created_at, 
  tj.assigned_at, 
  tj.completed_at
FROM 
(
  SELECT * FROM cte1 UNION SELECT * FROM cte3
) tj 
ORDER BY tj.job_number;

=============== DDL and DML ==============

CREATE TABLE IF NOT EXISTS test_jobs 
(
  id INT NOT NULL AUTO_INCREMENT PRIMARY KEY
  , job_number INT NOT NULL DEFAULT 0
  , emp_id INT NOT NULL DEFAULT 0
  , created_at datetime NOT NULL DEFAULT current_timestamp
  , assigned_at datetime NOT NULL DEFAULT current_timestamp
  , completed_at datetime NOT NULL DEFAULT current_timestamp
) ENGINE = InnoDB CHARACTER SET = utf8;

INSERT INTO test_jobs
(job_number, emp_id, created_at, assigned_at, completed_at)
VALUES
(1, 1, '2019-06-28 01:00:00', '2019-06-28 01:15:00', '2019-06-28 02:00:00'),
(2, 1, '2019-06-28 01:30:00', '2019-06-28 01:45:00', '2019-06-28 02:30:00'),
(3, 1, '2019-06-28 02:00:00', '2019-06-28 02:15:00', '2019-06-28 03:00:00'),
(4, 1, '2019-06-28 05:00:00', '2019-06-28 05:15:00', '2019-06-28 06:00:00'),
(5, 2, '2019-06-28 01:00:00', '2019-06-28 01:15:00', '2019-06-28 02:00:00'),
(6, 2, '2019-06-28 01:30:00', '2019-06-28 01:45:00', '2019-06-28 02:30:00'),
(7, 2, '2019-06-28 03:00:00', '2019-06-28 03:05:00', '2019-06-28 03:30:00'),
(8, 3, '2019-06-28 01:00:00', '2019-06-28 01:15:00', '2019-06-28 02:00:00'),
(9, 3, '2019-06-28 02:00:00', '2019-06-28 02:15:00', '2019-06-28 03:00:00');