Get qualifying rows only
One way ..
WITH x AS (
SELECT *
,row_number() OVER (PARTITION BY reference ORDER BY date_out DESC) AS rn
FROM tbl
)
, y AS (
SELECT *
FROM x
WHERE x.rn = 1
AND date_out >= '2012-10-01'
AND date_out < '2012-11-01'
)
, z AS (
SELECT x.*
FROM x
JOIN y USING (reference)
WHERE x.rn = 2
AND x.centreid = 1
)
SELECT y.taskid, y.reference, y.centreid, y.date_out
FROM y
JOIN z USING (reference)
UNION ALL
SELECT taskid, reference, centreid, date_out
FROM z
ORDER BY reference, date_out;
Another way:
WITH x AS (
SELECT *
,row_number() OVER (PARTITION BY reference ORDER BY date_out DESC) AS rn
FROM tbl
)
,y AS (
SELECT x.*, y.taskid AS taskid2, y.centreid AS centreid2, y.date_out AS date_out2
FROM x
JOIN x y USING (reference)
WHERE x.rn = 1
AND x.date_out >= '2012-10-01'
AND x.date_out < '2012-11-01'
AND y.rn = 2
AND y.centreid = 1
)
SELECT y.taskid, y.reference, y.centreid, y.date_out
FROM y
UNION ALL
SELECT y.taskid2, y.reference, y.centreid2, y.date_out2
FROM y
ORDER BY reference, date_out;
I'd expect the second one to be faster. Depends on your data distribution. Test with EXPLAIN ANALYZE
.
Get all rows for qualifying references
WITH x AS (
SELECT *
,row_number() OVER (PARTITION BY reference ORDER BY date_out DESC) AS rn
FROM tbl
)
,y AS (
SELECT reference
FROM x
JOIN x y USING (reference)
WHERE x.rn = 1
AND x.date_out >= '2012-10-01'
AND x.date_out < '2012-11-01'
AND y.rn = 2
AND y.centreid = 1
)
SELECT *
FROM tbl
JOIN y USING (reference)
ORDER BY reference, date_out;
-> sqlfiddle
Answer to follow-up in comment
Separate groups if more than 30 days between entries.
WITH a AS (
SELECT *
,lag(date_out) OVER (PARTITION BY reference ORDER BY date_out DESC) AS last_date
,CASE WHEN date_out >
(lag(date_out) OVER (PARTITION BY reference ORDER BY date_out DESC) - 30)
THEN 0 ELSE 1
END AS step
FROM tbl
)
,b AS (
SELECT *
,sum(step) OVER (PARTITION BY reference ORDER BY date_out DESC) AS grp
FROM a
)
,c AS (
SELECT *
,row_number() OVER (PARTITION BY reference, grp ORDER BY date_out DESC) AS rn
FROM b
)
,d AS (
SELECT reference, grp
FROM c
JOIN c d USING (reference, grp)
WHERE c.rn = 1
AND c.date_out >= '2012-10-01'
AND c.date_out < '2012-11-01'
AND d.rn = 2
AND d.centreid = 1
)
SELECT b.taskid, b.reference, b.centreid, b.date_out
FROM b
JOIN d USING (reference, grp)
ORDER BY reference, date_out
-> sqlfiddle
But while pure SQL is a beaut .. I would solve this procedurally in a plpgsql function. Very similar to this recent answer on SO. Would be faster, because it can be done in a single table scan.
I suggest a "data-modifying CTE":
WITH cte AS (
SELECT *, row_number() OVER () AS rn
FROM (
SELECT id
FROM tbl
ORDER BY random()
LIMIT 600 -- 2 x 300
) sub
)
INSERT INTO table2(source, target)
SELECT c1.id, c2.id
FROM cte c1
JOIN cte c2 ON c2.rn = c1.rn + 300;
In the CTE:
- select 600 random rows (to create 300 new rows)
- add a row number in the outer SELECT.
Then couple two values in a self join with 300 offset.
To get random rows from a huge table cheaply, consider:
Best way to select random rows PostgreSQL
Best Answer