Postgresql – Best index for a join table

indexindex-tuningpostgresql

I have two tables that are joined using a join table.
I tried to found the best index(es).

DROP TABLE IF EXISTS jobs CASCADE;
DROP TABLE IF EXISTS devices CASCADE;
DROP TABLE IF EXISTS jobs_devices CASCADE;

CREATE TABLE jobs (id INTEGER);
CREATE TABLE devices (id INTEGER);
CREATE TABLE jobs_devices(job_id INTEGER, device_id INTEGER);

INSERT INTO jobs (id) SELECT 
   (random()*20000)::int AS id 
FROM  
   generate_series (1,200000) AS x(n);

INSERT INTO devices (id) SELECT 
   (random()*20000)::int AS id 
FROM  
   generate_series (1,200000) AS x(n);

INSERT INTO jobs_devices (job_id, device_id) SELECT 
   (random()*20000)::int AS job_id,
   (random()*20000)::int AS device_id
FROM  
   generate_series (1,100000) AS x(n);

CREATE INDEX jobs_id_idx ON jobs(id);
CREATE INDEX devices_id_idx ON devices(id);

EXPLAIN ANALYZE SELECT * FROM jobs INNER JOIN jobs_devices ON jobs.id = jobs_devices.job_id WHERE jobs_devices.device_id = 10522;
/*
Merge Join  (cost=1649.51..18003.51 rows=474000 width=12) (actual time=67.770..67.776 rows=8 loops=1)
  Merge Cond: (jobs.id = jobs_devices.job_id)
  ->  Index Only Scan using jobs_id_idx on jobs  (cost=0.42..8744.42 rows=200000 width=4) (actual time=0.095..47.730 rows=84464 loops=1)
        Heap Fetches: 84464
  ->  Sort  (cost=1649.09..1650.28 rows=474 width=8) (actual time=14.654..14.655 rows=1 loops=1)
        Sort Key: jobs_devices.job_id
        Sort Method: quicksort  Memory: 25kB
        ->  Seq Scan on jobs_devices  (cost=0.00..1628.03 rows=474 width=8) (actual time=5.248..14.643 rows=1 loops=1)
              Filter: (device_id = 10522)
              Rows Removed by Filter: 99999
Planning time: 0.303 ms
Execution time: 67.809 ms*/

with multicolumn index

CREATE INDEX jobs_job_id_device_id_idx ON jobs_devices(job_id, device_id);

EXPLAIN ANALYZE SELECT * FROM jobs INNER JOIN jobs_devices ON jobs.id = jobs_devices.job_id WHERE jobs_devices.device_id = 10522;
/*
Nested Loop  (cost=4.50..1900.67 rows=52 width=12) (actual time=3.207..10.676 rows=8 loops=1)
  ->  Seq Scan on jobs_devices  (cost=0.00..1693.00 rows=5 width=8) (actual time=3.175..10.634 rows=1 loops=1)
        Filter: (device_id = 10522)
        Rows Removed by Filter: 99999
  ->  Bitmap Heap Scan on jobs  (cost=4.50..41.43 rows=10 width=4) (actual time=0.020..0.026 rows=8 loops=1)
        Recheck Cond: (id = jobs_devices.job_id)
        Heap Blocks: exact=8
        ->  Bitmap Index Scan on jobs_id_idx  (cost=0.00..4.50 rows=10 width=0) (actual time=0.015..0.015 rows=8 loops=1)
              Index Cond: (id = jobs_devices.job_id)
Planning time: 0.597 ms
Execution time: 10.707 ms
*/

with multicolumn index in right order

DROP INDEX jobs_job_id_device_id_idx;
--multicolumn index
CREATE INDEX jobs_device_id_job_idx ON jobs_devices(device_id, job_id); 

EXPLAIN ANALYZE SELECT * FROM jobs INNER JOIN jobs_devices ON jobs.id = jobs_devices.job_id WHERE jobs_devices.device_id = 10522;
/*
Nested Loop  (cost=4.79..232.05 rows=52 width=12) (actual time=0.044..0.051 rows=8 loops=1)
  ->  Index Only Scan using jobs_device_id_job_idx on jobs_devices  (cost=0.29..24.38 rows=5 width=8) (actual time=0.032..0.032 rows=1 loops=1)
        Index Cond: (device_id = 10522)
        Heap Fetches: 1
  ->  Bitmap Heap Scan on jobs  (cost=4.50..41.43 rows=10 width=4) (actual time=0.010..0.015 rows=8 loops=1)
        Recheck Cond: (id = jobs_devices.job_id)
        Heap Blocks: exact=8
        ->  Bitmap Index Scan on jobs_id_idx  (cost=0.00..4.50 rows=10 width=0) (actual time=0.006..0.006 rows=8 loops=1)
              Index Cond: (id = jobs_devices.job_id)
Planning time: 0.444 ms
Execution time: 0.072 ms
*/

with 2 index and without the multicolumn index

DROP INDEX jobs_device_id_job_idx;
CREATE INDEX jobs_device_id_idx ON jobs_devices(device_id);
CREATE INDEX jobs_job_id_idx ON jobs_devices(job_id);
EXPLAIN ANALYZE SELECT * FROM jobs INNER JOIN jobs_devices ON jobs.id = jobs_devices.job_id WHERE jobs_devices.device_id = 10522;
/*
Nested Loop  (cost=8.83..230.47 rows=52 width=12) (actual time=0.028..0.036 rows=8 loops=1)
  ->  Bitmap Heap Scan on jobs_devices  (cost=4.33..22.80 rows=5 width=8) (actual time=0.017..0.017 rows=1 loops=1)
        Recheck Cond: (device_id = 10522)
        Heap Blocks: exact=1
        ->  Bitmap Index Scan on jobs_device_id_idx  (cost=0.00..4.33 rows=5 width=0) (actual time=0.014..0.014 rows=1 loops=1)
              Index Cond: (device_id = 10522)
  ->  Bitmap Heap Scan on jobs  (cost=4.50..41.43 rows=10 width=4) (actual time=0.009..0.016 rows=8 loops=1)
        Recheck Cond: (id = jobs_devices.job_id)
        Heap Blocks: exact=8
        ->  Bitmap Index Scan on jobs_id_idx  (cost=0.00..4.50 rows=10 width=0) (actual time=0.007..0.007 rows=8 loops=1)
              Index Cond: (id = jobs_devices.job_id)
Planning time: 0.420 ms
Execution time: 0.059 ms
*/

ok…. two indexes…

Now with devices. I want to see if I can use indexes in different situations

with 2 index and without the multicolumn index

EXPLAIN ANALYZE SELECT * FROM devices INNER JOIN jobs_devices ON devices.id = jobs_devices.device_id WHERE jobs_devices.device_id = 10522;
/*
Nested Loop  (cost=8.83..64.87 rows=50 width=12) (actual time=0.032..0.051 rows=10 loops=1)
  ->  Bitmap Heap Scan on devices  (cost=4.50..41.43 rows=10 width=4) (actual time=0.021..0.033 rows=10 loops=1)
        Recheck Cond: (id = 10522)
        Heap Blocks: exact=10
        ->  Bitmap Index Scan on devices_id_idx  (cost=0.00..4.50 rows=10 width=0) (actual time=0.015..0.015 rows=10 loops=1)
              Index Cond: (id = 10522)
  ->  Materialize  (cost=4.33..22.83 rows=5 width=8) (actual time=0.001..0.001 rows=1 loops=10)
        ->  Bitmap Heap Scan on jobs_devices  (cost=4.33..22.80 rows=5 width=8) (actual time=0.007..0.008 rows=1 loops=1)
              Recheck Cond: (device_id = 10522)
              Heap Blocks: exact=1
              ->  Bitmap Index Scan on jobs_device_id_idx  (cost=0.00..4.33 rows=5 width=0) (actual time=0.005..0.005 rows=1 loops=1)
                    Index Cond: (device_id = 10522)
Planning time: 0.130 ms
Execution time: 0.088 ms
*/

but using the initial multicolumn index for our first column

DROP index jobs_device_id_idx;
DROP INDEX jobs_job_id_idx;
CREATE INDEX jobs_job_id_device_id_idx ON jobs_devices(job_id, device_id);
EXPLAIN ANALYZE SELECT * FROM devices INNER JOIN jobs_devices ON devices.id = jobs_devices.device_id WHERE jobs_devices.device_id = 10522;
/*
Nested Loop  (cost=4.50..1735.07 rows=50 width=12) (actual time=4.575..14.733 rows=10 loops=1)
  ->  Bitmap Heap Scan on devices  (cost=4.50..41.43 rows=10 width=4) (actual time=0.013..0.044 rows=10 loops=1)
        Recheck Cond: (id = 10522)
        Heap Blocks: exact=10
        ->  Bitmap Index Scan on devices_id_idx  (cost=0.00..4.50 rows=10 width=0) (actual time=0.010..0.010 rows=10 loops=1)
              Index Cond: (id = 10522)
  ->  Materialize  (cost=0.00..1693.03 rows=5 width=8) (actual time=0.456..1.468 rows=1 loops=10)
        ->  Seq Scan on jobs_devices  (cost=0.00..1693.00 rows=5 width=8) (actual time=4.558..14.670 rows=1 loops=1)
              Filter: (device_id = 10522)
              Rows Removed by Filter: 99999
Planning time: 0.255 ms
Execution time: 14.775 ms
*/

  • What are the pro-cons between 2 indexes (2 Bitmap Heap Scan) on one column and the multicolumn (Index Only Scan)?
  • It seems that 2 indexes are more "multipurpose"?

Best Answer

Two multcolumn indexes can give you some profit. When you have the id of one of the relations, postgresql would not have to consult the relation table to obtain the id of the other, since it is part of the index. But this will only be noticeable for larger selections. With two indexes on the individual ids, postgresql has to skip between index and table to continue to the other relation.