PostgreSQL Query Optimization – How to Optimize a Subquery Based on a Join

performancepostgresqlpostgresql-performance

I have this query, that finds all active contracts for candidate who have had less than 20 total completed contracts, or less than 5 for the contract site.

EXPLAIN (ANALYZE,COSTS, VERBOSE, BUFFERS, FORMAT JSON)
SELECT "CONTRACTS".* FROM "CONTRACTS" 
WHERE "CONTRACTS"."ID" IN (
  SELECT "CONTRACTS"."ID" FROM "CONTRACTS" 
  INNER JOIN "ROLES" ON "ROLES"."ID" = "CONTRACTS"."ROLE_ID" 
  WHERE (
    CANDIDATE_ID NOT IN (
      SELECT DISTINCT "CONTRACTS"."CANDIDATE_ID" FROM "CONTRACTS" 
      WHERE "CONTRACTS"."STATE" IN ('ARCHIVED', 'PENDING_APPROVAL', 'APPROVED', 'AUTO_APPROVED') 
      GROUP BY "CONTRACTS"."CANDIDATE_ID" 
      HAVING (COUNT(CONTRACTS.ID) >= 20)
    ) AND (CANDIDATE_ID, ROLES.SITE_ID) NOT IN (
      SELECT DISTINCT "CONTRACTS"."CANDIDATE_ID", "ROLES"."SITE_ID" FROM "CONTRACTS" 
      INNER JOIN "ROLES" ON "ROLES"."ID" = "CONTRACTS"."ROLE_ID" 
      WHERE "CONTRACTS"."STATE" IN ('ARCHIVED', 'PENDING_APPROVAL', 'APPROVED', 'AUTO_APPROVED') 
      GROUP BY contracts.CANDIDATE_ID, ROLES.SITE_ID 
      HAVING (COUNT(ROLES.SITE_ID) >= 5)
    )
  )
)  AND "CONTRACTS"."STATE" = 'ACTIVE' AND (CONTRACTS.START_DATETIME >= '2018-06-16 13:16:09.013943' AND CONTRACTS.START_DATETIME <= '2018-06-16 17:16:09.015998')

~45% of the query time is caused by a sequence scan to match the 2nd subquery. Is there any way to speed up that subquery?

The query plan is below;

Hash Semi Join  (cost=1714368.60..1714534.45 rows=21 width=1458) (actual time=12317.173..13364.060 rows=103 loops=1)
  Output: contracts.id, contracts.role_id, contracts.candidate_id, contracts.name, contracts.internal_name, contracts.description, contracts.created_at, contracts.updated_at, contracts.start_date, contracts.hourly_rate_pennies, contracts.hourly_rate_currency, contracts.state, contracts.first_seen, contracts.sent, contracts.waitlist, contracts.date_time_info, contracts.dress_code, contracts.why_work_here, contracts.reason_for_cancelling, contracts.reason_for_rejecting, contracts.dismissed, contracts.calculable, contracts.archived, contracts.fixed_rate, contracts.cancelled_by_candidate_at, contracts.hourly_rate_invoice_pennies, contracts.hourly_rate_invoice_currency, contracts.bonus, contracts.manual_push_notification_sent, contracts.helpful_directions, contracts.entry_point, contracts.who_to_ask_for, contracts.what_to_do_when_nothing_to_do, contracts.standby_role_id, contracts.checked_in, contracts.checked_in_by_id, contracts.checked_in_at, contracts.checked_in_not_required, contracts.checked_in_not_required_by_id, contracts.checked_in_not_required_at, contracts.messaged, contracts.messaged_by_id, contracts.messaged_at, contracts.notes, contracts.checked_in_by_type, contracts.start_datetime, contracts.end_datetime, contracts.business_id, contracts.earliest_end_datetime, contracts.travel_time_minutes, contracts.start_time_hours, contracts.start_time_minutes, contracts.finish_time_hours, contracts.finish_time_minutes, contracts.rating, contracts.length_hours, contracts.break_hours, contracts.length_minutes, contracts.break_minutes, contracts.employer_rating, contracts.discount, contracts.on_time, contracts.feedback_from_candidate, contracts.ignore_rating, contracts.feedback_department, contracts.feedback_supervisor, contracts.paid, contracts.payroll_run_id, contracts.paid_at, contracts.timesheet_recorded_at, contracts.submitted_shifts_count, contracts.blocked_at, contracts.favourited_at, contracts.candidate_favourited, contracts.employer_should_be_invoiced, contracts.pay_candidate_minimum, contracts.tier_number, contracts.feedback_best_thing, contracts.feedback_worst_thing, contracts.feedback_accurate_description, contracts.candidate_responded_at, contracts.candidate_use_again_set_at, contracts.address, contracts.latitude, contracts.longitude, contracts.instant_claim, contracts.additional_payment_id, contracts.favourites_text_sent_at, contracts.last_braze_reminder, contracts.charge_total_pennies, contracts.charge_total_currency, contracts.invoice_id
  Hash Cond: (contracts.id = contracts_1.id)
  Buffers: shared hit=510476 read=1014439 dirtied=7, temp read=17534 written=17528
  I/O Timings: read=2096.216
  ->  Bitmap Heap Scan on public.contracts  (cost=866.14..1031.87 rows=83 width=1458) (actual time=21.107..21.575 rows=155 loops=1)
        Output: contracts.id, contracts.role_id, contracts.candidate_id, contracts.name, contracts.internal_name, contracts.description, contracts.created_at, contracts.updated_at, contracts.start_date, contracts.hourly_rate_pennies, contracts.hourly_rate_currency, contracts.state, contracts.first_seen, contracts.sent, contracts.waitlist, contracts.date_time_info, contracts.dress_code, contracts.why_work_here, contracts.reason_for_cancelling, contracts.reason_for_rejecting, contracts.dismissed, contracts.calculable, contracts.archived, contracts.fixed_rate, contracts.cancelled_by_candidate_at, contracts.hourly_rate_invoice_pennies, contracts.hourly_rate_invoice_currency, contracts.bonus, contracts.manual_push_notification_sent, contracts.helpful_directions, contracts.entry_point, contracts.who_to_ask_for, contracts.what_to_do_when_nothing_to_do, contracts.standby_role_id, contracts.checked_in, contracts.checked_in_by_id, contracts.checked_in_at, contracts.checked_in_not_required, contracts.checked_in_not_required_by_id, contracts.checked_in_not_required_at, contracts.messaged, contracts.messaged_by_id, contracts.messaged_at, contracts.notes, contracts.checked_in_by_type, contracts.start_datetime, contracts.end_datetime, contracts.business_id, contracts.earliest_end_datetime, contracts.travel_time_minutes, contracts.start_time_hours, contracts.start_time_minutes, contracts.finish_time_hours, contracts.finish_time_minutes, contracts.rating, contracts.length_hours, contracts.break_hours, contracts.length_minutes, contracts.break_minutes, contracts.employer_rating, contracts.discount, contracts.on_time, contracts.feedback_from_candidate, contracts.ignore_rating, contracts.feedback_department, contracts.feedback_supervisor, contracts.paid, contracts.payroll_run_id, contracts.paid_at, contracts.timesheet_recorded_at, contracts.submitted_shifts_count, contracts.blocked_at, contracts.favourited_at, contracts.candidate_favourited, contracts.employer_should_be_invoiced, contracts.pay_candidate_minimum, contracts.tier_number, contracts.feedback_best_thing, contracts.feedback_worst_thing, contracts.feedback_accurate_description, contracts.candidate_responded_at, contracts.candidate_use_again_set_at, contracts.address, contracts.latitude, contracts.longitude, contracts.instant_claim, contracts.additional_payment_id, contracts.favourites_text_sent_at, contracts.last_braze_reminder, contracts.charge_total_pennies, contracts.charge_total_currency, contracts.invoice_id
        Recheck Cond: (((contracts.state)::text = 'active'::text) AND (contracts.start_datetime >= '2018-06-16 13:16:09.013943'::timestamp without time zone) AND (contracts.start_datetime <= '2018-06-16 17:16:09.015998'::timestamp without time zone))
        Heap Blocks: exact=241
        Buffers: shared hit=1345
        ->  BitmapAnd  (cost=866.14..866.14 rows=83 width=0) (actual time=21.065..21.065 rows=0 loops=1)
              Buffers: shared hit=1104
              ->  Bitmap Index Scan on index_contracts_on_state  (cost=0.00..274.20 rows=16059 width=0) (actual time=0.826..0.826 rows=5169 loops=1)
                    Index Cond: ((contracts.state)::text = 'active'::text)
                    Buffers: shared hit=89
              ->  Bitmap Index Scan on index_contracts_on_start_datetime  (cost=0.00..591.88 rows=40897 width=0) (actual time=19.655..19.655 rows=120496 loops=1)
                    Index Cond: ((contracts.start_datetime >= '2018-06-16 13:16:09.013943'::timestamp without time zone) AND (contracts.start_datetime <= '2018-06-16 17:16:09.015998'::timestamp without time zone))
                    Buffers: shared hit=1015
  ->  Hash  (cost=1706581.42..1706581.42 rows=1977439 width=4) (actual time=12293.957..12293.957 rows=6452571 loops=1)
        Output: contracts_1.id
        Buckets: 4194304 (originally 2097152)  Batches: 4 (originally 1)  Memory Usage: 89427kB
        Buffers: shared hit=509131 read=1014439 dirtied=7, temp written=14181
        I/O Timings: read=2096.216
        ->  Hash Join  (cost=257366.60..1706581.42 rows=1977439 width=4) (actual time=310.565..10273.581 rows=6452571 loops=1)
              Output: contracts_1.id
              Hash Cond: (contracts_1.role_id = roles.id)
              Join Filter: (NOT (hashed SubPlan 2))
              Rows Removed by Join Filter: 24861
              Buffers: shared hit=509130 read=1014439 dirtied=7
              I/O Timings: read=2096.216
              ->  Seq Scan on public.contracts contracts_1  (cost=112334.65..1542763.80 rows=3954878 width=12) (actual time=87.426..6966.694 rows=6477628 loops=1)
                    Output: contracts_1.role_id, contracts_1.candidate_id, contracts_1.id
                    Filter: (NOT (hashed SubPlan 1))
                    Rows Removed by Filter: 1718081
                    Buffers: shared hit=433016 read=1014439 dirtied=7
                    I/O Timings: read=2096.216
                    SubPlan 1
                      ->  Unique  (cost=112211.38..112331.29 rows=6730 width=4) (actual time=70.316..87.056 rows=934 loops=1)
                            Output: contracts_2.candidate_id
                            Buffers: shared hit=44710
                            ->  GroupAggregate  (cost=112211.38..112327.92 rows=6730 width=4) (actual time=70.315..86.895 rows=934 loops=1)
                                  Output: contracts_2.candidate_id
                                  Group Key: contracts_2.candidate_id
                                  Filter: (count(contracts_2.id) >= 20)
                                  Rows Removed by Filter: 4608
                                  Buffers: shared hit=44710
                                  ->  Sort  (cost=112211.38..112243.49 rows=64237 width=8) (actual time=70.288..77.274 rows=69565 loops=1)
                                        Output: contracts_2.candidate_id, contracts_2.id
                                        Sort Key: contracts_2.candidate_id
                                        Sort Method: quicksort  Memory: 6333kB
                                        Buffers: shared hit=44710
                                        ->  Index Scan using index_contracts_on_state on public.contracts contracts_2  (cost=0.11..111185.44 rows=64237 width=8) (actual time=0.021..53.546 rows=69565 loops=1)
                                              Output: contracts_2.candidate_id, contracts_2.id
                                              Index Cond: ((contracts_2.state)::text = ANY ('{archived,pending_approval,approved,auto_approved}'::text[]))
                                              Buffers: shared hit=44710
              ->  Hash  (cost=15876.96..15876.96 rows=58319 width=8) (actual time=50.103..50.103 rows=58362 loops=1)
                    Output: roles.id, roles.site_id
                    Buckets: 65536  Batches: 1  Memory Usage: 2792kB
                    Buffers: shared hit=15702
                    ->  Seq Scan on public.roles  (cost=0.00..15876.96 rows=58319 width=8) (actual time=0.012..39.442 rows=58362 loops=1)
                          Output: roles.id, roles.site_id
                          Buffers: shared hit=15702
              SubPlan 2
                ->  Unique  (cost=128533.34..128918.76 rows=64237 width=8) (actual time=152.182..171.761 rows=2772 loops=1)
                      Output: contracts_3.candidate_id, roles_1.site_id
                      Buffers: shared hit=60412
                      ->  GroupAggregate  (cost=128533.34..128854.52 rows=64237 width=8) (actual time=152.181..171.307 rows=2772 loops=1)
                            Output: contracts_3.candidate_id, roles_1.site_id
                            Group Key: contracts_3.candidate_id, roles_1.site_id
                            Filter: (count(roles_1.site_id) >= 5)
                            Rows Removed by Filter: 27469
                            Buffers: shared hit=60412
                            ->  Sort  (cost=128533.34..128565.46 rows=64237 width=8) (actual time=152.164..157.973 rows=69565 loops=1)
                                  Output: contracts_3.candidate_id, roles_1.site_id
                                  Sort Key: contracts_3.candidate_id, roles_1.site_id
                                  Sort Method: quicksort  Memory: 6333kB
                                  Buffers: shared hit=60412
                                  ->  Hash Join  (cost=16081.19..127507.40 rows=64237 width=8) (actual time=46.007..123.389 rows=69565 loops=1)
                                        Output: contracts_3.candidate_id, roles_1.site_id
                                        Hash Cond: (contracts_3.role_id = roles_1.id)
                                        Buffers: shared hit=60412
                                        ->  Index Scan using index_contracts_on_state on public.contracts contracts_3  (cost=0.11..111185.44 rows=64237 width=8) (actual time=0.023..52.781 rows=69565 loops=1)
                                              Output: contracts_3.candidate_id, contracts_3.role_id
                                              Index Cond: ((contracts_3.state)::text = ANY ('{archived,pending_approval,approved,auto_approved}'::text[]))
                                              Buffers: shared hit=44710
                                        ->  Hash  (cost=15876.96..15876.96 rows=58319 width=8) (actual time=45.910..45.910 rows=58362 loops=1)
                                              Output: roles_1.site_id, roles_1.id
                                              Buckets: 65536  Batches: 1  Memory Usage: 2792kB
                                              Buffers: shared hit=15702
                                              ->  Seq Scan on public.roles roles_1  (cost=0.00..15876.96 rows=58319 width=8) (actual time=0.007..36.585 rows=58362 loops=1)
                                                    Output: roles_1.site_id, roles_1.id
                                                    Buffers: shared hit=15702
Planning time: 1.448 ms
Execution time: 13367.009 ms

Contracts table has 8,300,000 rows with the schema;

-- Dumped from database version 10.3
-- Dumped by pg_dump version 10.3

SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET client_min_messages = warning;
SET row_security = off;

SET default_tablespace = '';

SET default_with_oids = false;

CREATE TABLE public.contracts (
    id integer NOT NULL,
    role_id integer,
    candidate_id integer,
    name character varying,
    internal_name character varying,
    description text,
    created_at timestamp without time zone NOT NULL,
    updated_at timestamp without time zone NOT NULL,
    start_date date,
    hourly_rate_pennies integer DEFAULT 0 NOT NULL,
    hourly_rate_currency character varying DEFAULT 'GBP'::character varying NOT NULL,
    state character varying DEFAULT 'pending'::character varying,
    first_seen timestamp without time zone,
    sent boolean DEFAULT false,
    waitlist boolean DEFAULT false,
    date_time_info character varying,
    dress_code character varying,
    why_work_here character varying,
    reason_for_cancelling text,
    reason_for_rejecting text,
    dismissed boolean DEFAULT false,
    calculable boolean DEFAULT true,
    archived boolean DEFAULT false,
    fixed_rate boolean DEFAULT false,
    cancelled_by_candidate_at timestamp without time zone,
    hourly_rate_invoice_pennies integer DEFAULT 0 NOT NULL,
    hourly_rate_invoice_currency character varying DEFAULT 'GBP'::character varying NOT NULL,
    bonus double precision,
    manual_push_notification_sent boolean,
    helpful_directions text,
    entry_point text,
    who_to_ask_for text,
    what_to_do_when_nothing_to_do text,
    standby_role_id integer,
    checked_in boolean DEFAULT false,
    checked_in_by_id integer,
    checked_in_at timestamp without time zone,
    checked_in_not_required boolean DEFAULT false,
    checked_in_not_required_by_id integer,
    checked_in_not_required_at timestamp without time zone,
    messaged boolean DEFAULT false,
    messaged_by_id integer,
    messaged_at timestamp without time zone,
    notes text,
    checked_in_by_type character varying,
    start_datetime timestamp without time zone,
    end_datetime timestamp without time zone,
    business_id integer,
    earliest_end_datetime timestamp without time zone,
    travel_time_minutes integer,
    start_time_hours integer,
    start_time_minutes integer,
    finish_time_hours integer,
    finish_time_minutes integer,
    rating integer,
    length_hours integer,
    break_hours integer,
    length_minutes integer,
    break_minutes integer,
    employer_rating integer,
    discount double precision DEFAULT 0.0,
    on_time boolean,
    feedback_from_candidate text,
    ignore_rating boolean DEFAULT false,
    feedback_department text,
    feedback_supervisor text,
    paid boolean DEFAULT false,
    payroll_run_id integer,
    paid_at timestamp without time zone,
    timesheet_recorded_at timestamp without time zone,
    submitted_shifts_count integer DEFAULT 1,
    blocked_at timestamp without time zone,
    favourited_at timestamp without time zone,
    candidate_favourited boolean DEFAULT false,
    employer_should_be_invoiced boolean DEFAULT false,
    pay_candidate_minimum boolean DEFAULT false,
    tier_number integer,
    feedback_best_thing character varying,
    feedback_worst_thing character varying,
    feedback_accurate_description boolean,
    candidate_responded_at timestamp without time zone,
    candidate_use_again_set_at timestamp without time zone,
    address text,
    latitude double precision,
    longitude double precision,
    instant_claim boolean DEFAULT false,
    additional_payment_id integer,
    favourites_text_sent_at timestamp without time zone,
    charge_total_pennies integer,
    charge_total_currency character varying DEFAULT 'GBP'::character varying NOT NULL,
    last_braze_reminder double precision,
    invoice_id integer
);


ALTER TABLE public.contracts OWNER TO james;

CREATE SEQUENCE public.contracts_id_seq
    AS integer
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1;


ALTER TABLE public.contracts_id_seq OWNER TO james;

ALTER SEQUENCE public.contracts_id_seq OWNED BY public.contracts.id;

ALTER TABLE ONLY public.contracts ALTER COLUMN id SET DEFAULT nextval('public.contracts_id_seq'::regclass);

ALTER TABLE ONLY public.contracts
    ADD CONSTRAINT contracts_pkey PRIMARY KEY (id);

CREATE INDEX contracts_composed_api_response_index ON public.contracts USING btree (candidate_id, state, start_date, archived);
CREATE UNIQUE INDEX contracts_role_id_candidate_id_key ON public.contracts USING btree (role_id, candidate_id);
CREATE INDEX index_contracts_on_calculable ON public.contracts USING btree (calculable);
CREATE INDEX index_contracts_on_cancelled_by_candidate_at ON public.contracts USING btree (cancelled_by_candidate_at);
CREATE INDEX index_contracts_on_candidate_id ON public.contracts USING btree (candidate_id);
CREATE INDEX index_contracts_on_candidate_id_and_business_id ON public.contracts USING btree (candidate_id, business_id);
CREATE INDEX index_contracts_on_candidate_id_and_candidate_responded_at ON public.contracts USING btree (candidate_id, candidate_responded_at);
CREATE INDEX index_contracts_on_candidate_id_and_state ON public.contracts USING btree (candidate_id, state);
CREATE INDEX index_contracts_on_checked_in ON public.contracts USING btree (checked_in);
CREATE INDEX index_contracts_on_checked_in_not_required ON public.contracts USING btree (checked_in_not_required);
CREATE INDEX index_contracts_on_created_at ON public.contracts USING btree (created_at);
CREATE INDEX index_contracts_on_earliest_end_datetime ON public.contracts USING btree (earliest_end_datetime);
CREATE INDEX index_contracts_on_end_datetime ON public.contracts USING btree (end_datetime);
CREATE INDEX index_contracts_on_paid ON public.contracts USING btree (paid);
CREATE INDEX index_contracts_on_role_id ON public.contracts USING btree (role_id);
CREATE INDEX index_contracts_on_role_id_and_state ON public.contracts USING btree (role_id, state);
CREATE INDEX index_contracts_on_start_datetime ON public.contracts USING btree (start_datetime);
CREATE INDEX index_contracts_on_state ON public.contracts USING btree (state);
CREATE INDEX index_contracts_on_state_and_start_date ON public.contracts USING btree (state, start_date);    

Roles table has 59,000 rows with the schema;

SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET client_min_messages = warning;
SET row_security = off;
SET default_tablespace = '';
SET default_with_oids = false;

CREATE TABLE public.roles (
    id integer NOT NULL,
    employer_id integer,
    search_id integer,
    name character varying,
    internal_name character varying,
    description text,
    number_to_hire integer,
    start_date date,
    available_now boolean,
    created_at timestamp without time zone NOT NULL,
    updated_at timestamp without time zone NOT NULL,
    address text DEFAULT ''::text,
    mobile_number text,
    site_id integer,
    state character varying DEFAULT 'draft'::character varying,
    date_time_info character varying,
    dress_code character varying,
    why_work_here character varying,
    contracts_last_sent timestamp without time zone,
    autosend boolean DEFAULT true,
    sent_batches_count integer DEFAULT 0,
    one_hour_expiry_sent boolean DEFAULT false,
    two_hour_expiry_sent boolean DEFAULT false,
    archived boolean DEFAULT false,
    job_type_id integer,
    industry_segment_id integer,
    hourly_rate_pennies integer,
    hourly_rate_currency character varying DEFAULT 'GBP'::character varying NOT NULL,
    latitude double precision,
    longitude double precision,
    is_in_grace_period boolean DEFAULT true,
    vip boolean DEFAULT false,
    checkin_text text,
    hourly_rate_invoice_pennies integer DEFAULT 0 NOT NULL,
    hourly_rate_invoice_currency character varying DEFAULT 'GBP'::character varying NOT NULL,
    bonus double precision,
    admin_user_id integer,
    helpful_directions text,
    entry_point text,
    who_to_ask_for text,
    what_to_do_when_nothing_to_do text,
    start_datetime timestamp without time zone,
    end_datetime timestamp without time zone,
    earliest_end_datetime timestamp without time zone,
    instant_book boolean,
    application_cut_off_date timestamp without time zone,
    applications_locked boolean DEFAULT false,
    send_counts json,
    overbooks integer DEFAULT 0,
    event_name character varying,
    tier_1_sent_at timestamp without time zone,
    tier_2_sent_at timestamp without time zone,
    tier_3_sent_at timestamp without time zone,
    favourites_can_instant_claim boolean DEFAULT true,
    supply_band integer DEFAULT 1,
    supply_band_1_sending boolean DEFAULT false,
    supply_band_2_sending boolean DEFAULT false,
    supply_band_3_sending boolean DEFAULT false,
    trial_shift boolean,
    max_reduction_in_attributes integer,
    attributes_reduced_at timestamp without time zone,
    checkin_sheet_id integer,
    personalised boolean DEFAULT false,
    hourly_rate_pennies_under_18 integer,
    hourly_rate_pennies_18_20 integer,
    hourly_rate_pennies_21_24 integer
);
ALTER TABLE public.roles OWNER TO james;

CREATE SEQUENCE public.roles_id_seq
    AS integer
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1;


ALTER TABLE public.roles_id_seq OWNER TO james;
ALTER SEQUENCE public.roles_id_seq OWNED BY public.roles.id;
ALTER TABLE ONLY public.roles ALTER COLUMN id SET DEFAULT nextval('public.roles_id_seq'::regclass);
ALTER TABLE ONLY public.roles
    ADD CONSTRAINT roles_pkey PRIMARY KEY (id);
CREATE INDEX index_roles_on_checkin_sheet_id ON public.roles USING btree (checkin_sheet_id);
CREATE INDEX index_roles_on_earliest_end_datetime ON public.roles USING btree (earliest_end_datetime);
CREATE INDEX index_roles_on_employer_id ON public.roles USING btree (employer_id);
CREATE INDEX index_roles_on_end_datetime ON public.roles USING btree (end_datetime);
CREATE INDEX index_roles_on_job_type_id ON public.roles USING btree (job_type_id);
CREATE INDEX index_roles_on_personalised ON public.roles USING btree (personalised);
CREATE INDEX index_roles_on_search_id ON public.roles USING btree (search_id);
CREATE INDEX index_roles_on_site_id ON public.roles USING btree (site_id);
CREATE INDEX index_roles_on_start_datetime ON public.roles USING btree (start_datetime);

Sites table has 5,900 rows with the schema;

SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET client_min_messages = warning;
SET row_security = off;
SET default_tablespace = '';
SET default_with_oids = false;

CREATE TABLE public.sites (
    id integer NOT NULL,
    business_id integer,
    created_at timestamp without time zone NOT NULL,
    updated_at timestamp without time zone NOT NULL,
    address text DEFAULT ''::text,
    require_business_approval boolean DEFAULT false,
    city_id integer,
    show_insights boolean DEFAULT false,
    requires_overbooks boolean DEFAULT true,
    minimum_shift_length integer DEFAULT 4,
    invoice_method text,
    checkin_sheet_receiver_emails character varying[] DEFAULT '{}'::character varying[],
    block_booking_allowed boolean DEFAULT false,
    business_unit character varying,
    kam_status character varying DEFAULT 'live'::character varying,
    kam_status_data json,
    invoice_group integer DEFAULT 0,
    invoice_week_end_day integer DEFAULT 0
);


ALTER TABLE public.sites OWNER TO james;

CREATE SEQUENCE public.sites_id_seq
    AS integer
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1;

ALTER TABLE public.sites_id_seq OWNER TO james;
ALTER SEQUENCE public.sites_id_seq OWNED BY public.sites.id;
ALTER TABLE ONLY public.sites ALTER COLUMN id SET DEFAULT nextval('public.sites_id_seq'::regclass);
ALTER TABLE ONLY public.sites
    ADD CONSTRAINT sites_pkey PRIMARY KEY (id);

Postgres version 10.3

This is the solution I ended up with:

SELECT contracts.* 
FROM contracts 
INNER JOIN roles ON roles.id = contracts.role_id 
WHERE (
  contracts.start_datetime BETWEEN (CURRENT_TIMESTAMP - INTERVAL '2 hour') AND (CURRENT_TIMESTAMP + INTERVAL '2 hour')
) AND (
  contracts.candidate_id NOT IN (
    SELECT c2.candidate_id
    FROM contracts c2
    JOIN roles r2 ON (r2.id = c2.role_id)
    WHERE c2.state IN ('archived', 'pending_approval', 'approved', 'auto_approved')
      AND (c2.candidate_id = contracts.candidate_id)
    GROUP BY c2.candidate_id
    HAVING
    (    COUNT(DISTINCT c2.id) >= 20
      OR COUNT(DISTINCT CASE WHEN r2.site_id = roles.site_id THEN c2.id END) >= 5
    )
  )
) AND contracts.state = 'active'

Best Answer

The query below uses EXISTS and NOT EXISTS instead of IN and NOT IN.
I think it should give the results you're aiming for.

But I could only test it on a small set of sample data.
So you'd have to test yourself :
1) if it's correct
2) if it has better performance

Also note that the timestamp criteria aren't hardcoded in it.
But you can just replace those with hardcoded values.
(And remove the TST_ from the table names)

SELECT c.* 
FROM TST_CONTRACTS c
WHERE c.START_DATETIME BETWEEN (CURRENT_TIMESTAMP - INTERVAL '4 hour') AND (CURRENT_TIMESTAMP - INTERVAL '1 hour')
AND c.STATE = 'ACTIVE' 
AND EXISTS (
    SELECT r.SITE_ID
    FROM TST_ROLES r
    WHERE (r.ID = c.ROLE_ID)
    AND NOT EXISTS (
      SELECT c2.CANDIDATE_ID
      FROM TST_CONTRACTS c2
      JOIN TST_ROLES r2 ON (r2.ID = c2.ROLE_ID)
      WHERE c2.STATE IN ('ARCHIVED', 'PENDING_APPROVAL', 'APPROVED', 'AUTO_APPROVED') 
        AND (c2.CANDIDATE_ID = c.CANDIDATE_ID)
      GROUP BY c2.CANDIDATE_ID
      HAVING 
      (    COUNT(DISTINCT c2.ID) >= 20
        OR COUNT(DISTINCT CASE WHEN r2.SITE_ID = r.SITE_ID THEN c2.ID END) >= 5
      )
   )
);

Sample data:

CREATE TABLE TST_CONTRACTS (ID INT, ROLE_ID INT, CANDIDATE_ID INT, STATE VARCHAR(30), START_DATETIME TIMESTAMP);
CREATE TABLE TST_ROLES (ID INT, SITE_ID INT);

INSERT INTO TST_CONTRACTS (ID, ROLE_ID, CANDIDATE_ID, STATE, START_DATETIME) VALUES
 (51,11,101,'ACTIVE',(CURRENT_TIMESTAMP - INTERVAL '2 hour'))
,(52,12,101,'ACTIVE',(CURRENT_TIMESTAMP - INTERVAL '2 hour'))
,(11,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(12,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(13,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(14,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(21,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(22,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(23,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
-- ,(24,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(25,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(26,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(27,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(28,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(29,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(30,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(31,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(32,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(33,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(34,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(35,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
-- ,(36,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
;

INSERT INTO TST_ROLES (ID, SITE_ID) VALUES
 (11,1001),(12,1002),(13,1002)
;