PostgreSQL GROUP BY Possible Sequences Explained

gaps-and-islands postgresql

I have a table containing the following data, using Postgres 9.6:

log_id | sequence | made_at (timestamp)
206480    1            1
206480    1            2
206480    2            3
206480    3            4
206480    1            5
206480    2            6
206480    4            7
206480    5            8
206480    1            9
206480    2           10
206481    1           11
206481    2           12
206481    3           13
206481    4           14

I have to group and aggregate on the ID so I get an array of possible sequences. In the end I want the data to look like this:

log_id | sequence
206480  {1,1,2,3}
206480  {1,2,4,5}
206480  {1,2}
206481  {1,2,3,4}

I want a new row (with the sequences) when:

the log_id changes; or
the next sequence number is lower than the current sequence number.

There is another column which specifies the ordering (a timestamp), but it's in another table (I join them and use that timestamp). I left it out to make things easier, but we can assume the column is called made_at.

Best Answer

-- Split each log_id's rows (ordered by made_at) into runs that start over
-- whenever sequence is lower than the previous row's sequence, and return
-- one array of sequence values per run.
select      log_id
            -- Without an ORDER BY inside the aggregate the element order is
            -- unspecified; order explicitly by made_at.
           ,array_agg (sequence order by made_at)

from       (select      made_at
                       ,log_id 
                       ,sequence
                        -- count() skips NULLs, so this running count only
                        -- increases on rows flagged as a restart; every row
                        -- of the same run therefore shares one restart_id.
                       ,count (is_restart) over
                        (
                            partition by    log_id 
                            order by        made_at
                        ) as restart_id

            from        (select      made_at
                                    ,log_id 
                                    ,sequence
                                     -- 1 when this row's sequence is lower
                                     -- than the previous row's (same log_id,
                                     -- made_at order); NULL otherwise,
                                     -- including the first row of a log_id
                                     -- where lag() has no previous row.
                                    ,case 
                                         when sequence <
                                              lag (sequence) over
                                              (
                                                  partition by    log_id 
                                                  order by        made_at
                                              ) 
                                         then 1
                                     end            is_restart

                         from        logs
                         ) flagged
            ) numbered

group by    log_id      
           ,restart_id

order by    log_id      
           ,restart_id
;

+--------+-----------+
| log_id | array_agg |
+--------+-----------+
| 206480 | {1,1,2,3} |
+--------+-----------+
| 206480 | {1,2,4,5} |
+--------+-----------+
| 206480 | {1,2}     |
+--------+-----------+
| 206481 | {1,2,3,4} |
+--------+-----------+

Walkthrough

Identify restarts by comparing current sequence to previous sequence (LAG).

-- Flag restart rows: is_restart is 1 when the current sequence is lower than
-- the previous row's sequence within the same log_id (rows ordered by
-- made_at), and NULL for every other row.
select
    made_at,
    log_id,
    sequence,
    case
        when sequence < lag (sequence) over prev_in_log then 1
    end as is_restart
from logs
window prev_in_log as (partition by log_id order by made_at)

+---------+--------+----------+------------+
| made_at | log_id | sequence | is_restart |
+---------+--------+----------+------------+
| 1       | 206480 | 1        |            |
+---------+--------+----------+------------+
| 2       | 206480 | 1        |            |
+---------+--------+----------+------------+
| 3       | 206480 | 2        |            |
+---------+--------+----------+------------+
| 4       | 206480 | 3        |            |
+---------+--------+----------+------------+
| 5       | 206480 | 1        | 1          |
+---------+--------+----------+------------+
| 6       | 206480 | 2        |            |
+---------+--------+----------+------------+
| 7       | 206480 | 4        |            |
+---------+--------+----------+------------+
| 8       | 206480 | 5        |            |
+---------+--------+----------+------------+
| 9       | 206480 | 1        | 1          |
+---------+--------+----------+------------+
| 10      | 206480 | 2        |            |
+---------+--------+----------+------------+
| 11      | 206481 | 1        |            |
+---------+--------+----------+------------+
| 12      | 206481 | 2        |            |
+---------+--------+----------+------------+
| 13      | 206481 | 3        |            |
+---------+--------+----------+------------+
| 14      | 206481 | 4        |            |
+---------+--------+----------+------------+

Do "running counts" (similar to "Running totals") of restarts (is_restart).
Rows that belong to the same group will have the same count (AKA restart_id).
The "Order by" in COUNT implies range between unbounded preceding and current row.

-- Running count of restart markers within each log_id, in made_at order.
-- count (is_restart) ignores the NULL markers, so the value only goes up on
-- a row flagged as a restart.
-- NOTE: the column is aliased group_id in this step; the full query and the
-- final grouping step refer to the same column as restart_id.
-- "(...)" stands for the restart-flagging query from the previous step.
select      log_id 
           ,sequence
           ,count (is_restart) over
            (
                partition by    log_id 
                order by        made_at
            ) as group_id

from        (...) l

+--------+----------+----------+
| log_id | sequence | group_id |
+--------+----------+----------+
| 206480 | 1        | 0        |
+--------+----------+----------+
| 206480 | 1        | 0        |
+--------+----------+----------+
| 206480 | 2        | 0        |
+--------+----------+----------+
| 206480 | 3        | 0        |
+--------+----------+----------+
| 206480 | 1        | 1        |
+--------+----------+----------+
| 206480 | 2        | 1        |
+--------+----------+----------+
| 206480 | 4        | 1        |
+--------+----------+----------+
| 206480 | 5        | 1        |
+--------+----------+----------+
| 206480 | 1        | 2        |
+--------+----------+----------+
| 206480 | 2        | 2        |
+--------+----------+----------+
| 206481 | 1        | 0        |
+--------+----------+----------+
| 206481 | 2        | 0        |
+--------+----------+----------+
| 206481 | 3        | 0        |
+--------+----------+----------+
| 206481 | 4        | 0        |
+--------+----------+----------+

Group by log_id and restart_id and aggregate sequence

-- Collapse every (log_id, restart_id) run into one array of its sequence
-- values; "(...)" stands for the running-count query from the previous step.
-- NOTE(review): array_agg has no ORDER BY here, so the element order inside
-- each array follows whatever order the subquery happens to deliver rows in.
-- If made_at is exposed by the subquery, "array_agg (sequence order by
-- made_at)" would pin it down; confirm before relying on the order.
select      log_id
           ,array_agg (sequence)

from       (...) l

group by    log_id      
           ,restart_id

order by    log_id      
           ,restart_id
;

+--------+-----------+
| log_id | array_agg |
+--------+-----------+
| 206480 | {1,1,2,3} |
+--------+-----------+
| 206480 | {1,2,4,5} |
+--------+-----------+
| 206480 | {1,2}     |
+--------+-----------+
| 206481 | {1,2,3,4} |
+--------+-----------+

Related Solutions

PostgreSQL Group Roles – Understanding and Managing

It sounds like what you probably want is to:

Create a role to own all the common tables and schema, or just use your own if you really will always be the only one with full control of the main tables.
Create another role you intend to give only read-only access to the shared tables and schemas. GRANT that role rights using GRANT SELECT ON ALL TABLES IN SCHEMA [x] for each shared schema. You may also want to ALTER DEFAULT PRIVILEGES to make sure this role has read rights on any new tables created in these schemas too.
Now GRANT each user membership of the read-only access role with INHERIT.
For the private schemas, create a schema the same as the user's username with CREATE SCHEMA [username] AUTHORIZATION [username] or the older style where you create the schema then ALTER SCHEMA ... OWNER TO.

See the postgresql manual for the detailed syntax of all of the above commands. Start with user management, part of the broader database administration topic that includes grant management etc. The PostgreSQL manual is detailed, comprehensive and readable: reading it is strongly recommended.

Postgresql – Find rows where integer sequence contains a given subsequence

If you are looking for significant performance improvements to dnoeth's answer, consider using a native C-function and creating the appropriate operator.

Here is an example for int4 arrays. (A generic array variant and the corresponding SQL script).

/*
 * _int_sequence_contained(x, y)
 *
 * Mirror image of _int_contains_sequence: forwards to it with the two
 * arguments swapped, so the result is _int_contains_sequence(y, x).
 */
Datum
_int_sequence_contained(PG_FUNCTION_ARGS)
{
    Datum       first_arg = PG_GETARG_DATUM(0);
    Datum       second_arg = PG_GETARG_DATUM(1);

    /* Delegate with the operands in reverse order. */
    return DirectFunctionCall2(_int_contains_sequence,
                               second_arg,
                               first_arg);
}

Datum
_int_contains_sequence(PG_FUNCTION_ARGS)
{
    ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
    ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
    int         na, nb;
    int32      *pa, *pb;
    int         i, j;

    na = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
    nb = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
    pa = (int32 *) ARR_DATA_PTR(a);
    pb = (int32 *) ARR_DATA_PTR(b);

    /* The naive searching algorithm. Replace it with a better one if your arrays are quite large. */
    for (i = 0; i <= na - nb; ++i)
    {
        for (j = 0; j < nb; ++j)
            if (pa[i + j] != pb[j])
                break;

        if (j == nb)
            PG_RETURN_BOOL(true);
    }

    PG_RETURN_BOOL(false);
}

-- SQL declaration of the C function _int_contains_sequence, taking two
-- integer arrays and returning a boolean.  STRICT: a NULL argument yields
-- NULL without calling the C code.
CREATE FUNCTION _int_contains_sequence(integer[], integer[])
    RETURNS boolean
    LANGUAGE C
    IMMUTABLE
    STRICT
    AS 'MODULE_PATHNAME';

-- SQL declaration of the C function _int_sequence_contained, taking two
-- integer arrays and returning a boolean.  STRICT: a NULL argument yields
-- NULL without calling the C code.
CREATE FUNCTION _int_sequence_contained(integer[], integer[])
    RETURNS boolean
    LANGUAGE C
    IMMUTABLE
    STRICT
    AS 'MODULE_PATHNAME';

-- a @@> b evaluates _int_contains_sequence(a, b); its commutator is <@@.
CREATE OPERATOR @@> (
    PROCEDURE  = _int_contains_sequence,
    LEFTARG    = integer[],
    RIGHTARG   = integer[],
    COMMUTATOR = '<@@',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);

-- a <@@ b evaluates _int_sequence_contained(a, b); its commutator is @@>.
CREATE OPERATOR <@@ (
    PROCEDURE  = _int_sequence_contained,
    LEFTARG    = integer[],
    RIGHTARG   = integer[],
    COMMUTATOR = '@@>',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);

Now you can filter rows like this.

-- Rows whose sequence array contains 12, 742, 225, 547 as a contiguous run.
SELECT *
FROM sequences
WHERE sequence @@> '{12, 742, 225, 547}'

I have conducted a little experiment to find how much faster this solution is.

-- Benchmark fixture: one row per id in 1..100000, each holding an array of
-- 30 integers obtained by casting random() * 10 to integer.
CREATE TEMP TABLE sequences AS
SELECT
    array_agg(CAST(random() * 10 AS integer)) AS sequence,
    g1 AS id
FROM generate_series(1, 100000) AS g1
CROSS JOIN generate_series(1, 30) AS g2
GROUP BY g1;

-- Baseline for comparison: render both arrays as text, turn the braces into
-- commas so every element is comma-delimited, and search with LIKE.
EXPLAIN ANALYZE
SELECT *
FROM sequences
WHERE translate(sequence::text, '{}', ',,')
      LIKE '%' || translate('{1,2,3,4}'::text, '{}', ',,') || '%'

"Seq Scan on sequences  (cost=0.00..7869.42 rows=28 width=36) (actual time=2.487..334.318 rows=251 loops=1)"
"  Filter: (translate((sequence)::text, '{}'::text, ',,'::text) ~~ '%,1,2,3,4,%'::text)"
"  Rows Removed by Filter: 99749"
"Planning time: 0.104 ms"
"Execution time: 334.365 ms"

-- Same search expressed with the custom @@> operator.
EXPLAIN ANALYZE
SELECT *
FROM sequences
WHERE sequence @@> '{1,2,3,4}'

"Seq Scan on sequences  (cost=0.00..5752.01 rows=282 width=36) (actual time=0.178..20.792 rows=251 loops=1)"
"  Filter: (sequence @@> '{1,2,3,4}'::integer[])"
"  Rows Removed by Filter: 99749"
"Planning time: 0.091 ms"
"Execution time: 20.859 ms"

So, it is about 16 times faster. If that is not enough, you can add support for GIN or GiST indexes, but this will be a much more difficult task.

Best Answer

Walkthrough

Related Solutions

PostgreSQL Group Roles – Understanding and Managing

Postgresql – Find rows where integer sequence contains a given subsequence

Related Question