SQL Server – Gaps and islands: client solution vs T-SQL query

Tags: gaps-and-islands, performance, query-performance, sql-server

Can a T-SQL solution for gaps and islands run faster than a C# solution running on the client?

To be specific, let us provide some test data:

-- Helper table of integers; the test-data inserts below read n from it
-- to generate one task interval per row.
CREATE TABLE dbo.Numbers
  (
    n INT NOT NULL ,
    -- Explicitly named, like PK_Tasks / UNQ_Tasks, so the constraint has a
    -- stable name instead of a system-generated one.
    CONSTRAINT PK_Numbers PRIMARY KEY ( n )
  ) ;
GO 

-- Seed the table with the single value 1.
INSERT INTO dbo.Numbers (n)
VALUES (1);
GO

-- Double the table 21 times: pass k re-inserts every existing value shifted
-- by 2^k, so the table ends up holding the contiguous range 1 .. 2^21.
DECLARE @pass INT = 0;

WHILE @pass < 21
BEGIN
    INSERT INTO dbo.Numbers (n)
    SELECT existing.n + POWER(2, @pass)
    FROM dbo.Numbers AS existing;

    SET @pass += 1;
END;
GO

-- One row per task interval, from StartedAt to FinishedAt.
CREATE TABLE dbo.Tasks
(
    StartedAt  SMALLDATETIME NOT NULL,
    FinishedAt SMALLDATETIME NOT NULL,
    -- The same column pair is keyed in both orders: once leading on the
    -- start time, once leading on the finish time.
    CONSTRAINT PK_Tasks  PRIMARY KEY (StartedAt, FinishedAt),
    CONSTRAINT UNQ_Tasks UNIQUE (FinishedAt, StartedAt)
);
GO

-- Test set 1: a 2-minute task starting at every minute offset n from
-- 2010-01-01, skipping n = 500000 .. 500005 to leave a hole in the sequence.
INSERT INTO dbo.Tasks (StartedAt, FinishedAt)
SELECT
    DATEADD(MINUTE, num.n, '20100101')     AS StartedAt,
    DATEADD(MINUTE, num.n + 2, '20100101') AS FinishedAt
FROM dbo.Numbers AS num
WHERE num.n NOT BETWEEN 500000 AND 500005;
GO

This first set of test data has exactly one gap:

-- List the tasks whose start falls in minute offsets 499999 .. 500006
-- (both ends inclusive), i.e. the rows on either side of the skipped range.
SELECT
    t.StartedAt,
    t.FinishedAt
FROM dbo.Tasks AS t
WHERE t.StartedAt >= DATEADD(MINUTE, 499999, '20100101')
  AND t.StartedAt <= DATEADD(MINUTE, 500006, '20100101');

The second set of test data has roughly 2M - 1 gaps: one gap between every pair of adjacent intervals (2,097,146 intervals, hence 2,097,145 gaps):

-- Test set 2: empty the table, then insert a 2-minute task starting every
-- 3 minutes (offset 3*n), so consecutive tasks never touch. The same
-- n = 500000 .. 500005 range is skipped as in the first set.
TRUNCATE TABLE dbo.Tasks;
GO

INSERT INTO dbo.Tasks (StartedAt, FinishedAt)
SELECT
    DATEADD(MINUTE, 3 * num.n, '20100101')     AS StartedAt,
    DATEADD(MINUTE, 3 * num.n + 2, '20100101') AS FinishedAt
FROM dbo.Numbers AS num
WHERE num.n NOT BETWEEN 500000 AND 500005;
GO

Currently I am running 2008 R2, but 2012 solutions are very welcome.
I have posted my C# solution as an answer.

Best Answer

And a 1 second solution...

-- Islands in dbo.Tasks: one output row per run of overlapping or touching
-- tasks, giving the run's first StartedAt and last FinishedAt.
;WITH cteSource(StartedAt, FinishedAt)
AS (
    -- Gap detection. Starts are numbered in StartedAt order and finishes in
    -- FinishedAt order, shifted by one, so the join pairs the k-th earliest
    -- start with the (k-1)-th earliest finish. The WHERE keeps only pairs
    -- where that start is strictly later than that finish: each surviving
    -- row holds the finish just before a gap and the start just after it.
    SELECT      s.StartedAt,
                e.FinishedAt
    FROM        (
                    SELECT  StartedAt,
                            ROW_NUMBER() OVER (ORDER BY StartedAt) AS rn
                    FROM    dbo.Tasks
                ) AS s
    INNER JOIN  (
                    SELECT  FinishedAt,
                            ROW_NUMBER() OVER (ORDER BY FinishedAt) + 1 AS rn
                    FROM    dbo.Tasks
                ) AS e ON e.rn = s.rn
    WHERE       s.StartedAt > e.FinishedAt

    UNION ALL

    -- Outer boundaries: the overall earliest start and latest finish.
    SELECT  MIN(StartedAt),
            MAX(FinishedAt)
    FROM    dbo.Tasks
), cteGrouped(theTime, grp)
AS (
    -- Flatten every boundary timestamp into one column and number them in
    -- time order; integer division by 2 pairs consecutive timestamps
    -- (1st with 2nd, 3rd with 4th, ...) under one grp value.
    -- UNPIVOT drops NULLs, so the all-NULL MIN/MAX row produced for an empty
    -- dbo.Tasks contributes nothing.
    SELECT  u.theTime,
            (ROW_NUMBER() OVER (ORDER BY u.theTime) - 1) / 2
    FROM    cteSource AS s
    UNPIVOT (
                theTime
                FOR theColumn IN (s.StartedAt, s.FinishedAt)
            ) AS u
)
-- Each grp holds two timestamps: the island's start and its end. The columns
-- are aliased so callers can read them by name or wrap the query in a
-- derived table / SELECT ... INTO.
SELECT      MIN(theTime) AS StartedAt,
            MAX(theTime) AS FinishedAt
FROM        cteGrouped
GROUP BY    grp
ORDER BY    grp