PostgreSQL Performance – How to Speed Up ORDER BY with Text Column

postgresql

Table schema:

CREATE SEQUENCE fsa_online_id_seq INCREMENT 1 MINVALUE 1 MAXVALUE 9223372036854775807 START 1 CACHE 1;

CREATE TABLE "public"."fsa_online" (
    "id" integer DEFAULT nextval('fsa_online_id_seq') NOT NULL,
    "fsa_uuid" uuid NOT NULL,
    "use_version" integer,
    "last_original_version" integer,
    "created_at" timestamp(0),
    "updated_at" timestamp(0),
    "deleted_at" timestamp(0),
    "is_drug" boolean DEFAULT true NOT NULL,
    CONSTRAINT "fsa_online_fsa_uuid_unique" UNIQUE ("fsa_uuid"),
    CONSTRAINT "fsa_online_pkey" PRIMARY KEY ("id")
) WITH (oids = false);

CREATE INDEX "fsa_online_is_drug_index" ON "public"."fsa_online" USING btree ("is_drug");
CREATE INDEX "fsa_online_last_original_version_index" ON "public"."fsa_online" USING btree ("last_original_version");
CREATE INDEX "fsa_online_use_version_index" ON "public"."fsa_online" USING btree ("use_version");


CREATE SEQUENCE fsa_online_data_id_seq INCREMENT 1 MINVALUE 1 MAXVALUE 9223372036854775807 START 1 CACHE 1;

CREATE TABLE "public"."fsa_online_data" (
    "id" integer DEFAULT nextval('fsa_online_data_id_seq') NOT NULL,
    "fsa_id" integer NOT NULL,
    "reason" text,
    "is_original" boolean NOT NULL,
    "is_published" boolean DEFAULT true NOT NULL,
    "created_by_id" integer,
    "created_at" timestamp(0),
    "unparsed_data" jsonb,
    "raw_id" integer NOT NULL,
    "status_id" integer,
    "type_id" integer,
    "reg_num" character varying(255),
    "start_date" date,
    "end_date" date,
    "docs" jsonb,
    "docs_add" text,
    "scheme" text,
    "free_form" jsonb,
    "fio_expert" text,
    "lab_info" jsonb,
    "change_info" jsonb,
    "applicant_info" jsonb,
    "manufacturer_info" jsonb,
    "product_info" jsonb,
    "standard_info" jsonb,
    "manufacturer_tbl_info" jsonb,
    "product_tbl_info" jsonb,
    "certification_info" jsonb,
    "trade_name" text,
    "cert_num" character varying(255),
    "man_form_txt" text,
    "manufacturer_name" text,
    "man_country_id" integer,
    "serial_num" character varying(255),
    "serial_size" integer,
    "barcode" character varying(255),
    "barcode_type_id" integer,
    "is_cert" boolean NOT NULL,
    "original_data" jsonb,
    CONSTRAINT "fsa_online_data_pkey" PRIMARY KEY ("id"),
    CONSTRAINT "fsa_online_data_fsa_id_foreign" FOREIGN KEY (fsa_id) REFERENCES fsa_online(id) ON UPDATE CASCADE ON DELETE RESTRICT NOT DEFERRABLE
) WITH (oids = false);

CREATE INDEX "fsa_online_data_barcode_index" ON "public"."fsa_online_data" USING btree ("barcode");
CREATE INDEX "fsa_online_data_barcode_type_id_index" ON "public"."fsa_online_data" USING btree ("barcode_type_id");
CREATE INDEX "fsa_online_data_cert_num_index" ON "public"."fsa_online_data" USING btree ("cert_num");
CREATE INDEX "fsa_online_data_created_by_id_index" ON "public"."fsa_online_data" USING btree ("created_by_id");
CREATE INDEX "fsa_online_data_end_date_index" ON "public"."fsa_online_data" USING btree ("end_date");
CREATE INDEX "fsa_online_data_fsa_id_index" ON "public"."fsa_online_data" USING btree ("fsa_id");
CREATE INDEX "fsa_online_data_is_cert_index" ON "public"."fsa_online_data" USING btree ("is_cert");
CREATE INDEX "fsa_online_data_man_country_id_index" ON "public"."fsa_online_data" USING btree ("man_country_id");
CREATE INDEX "fsa_online_data_raw_id_index" ON "public"."fsa_online_data" USING btree ("raw_id");
CREATE INDEX "fsa_online_data_reg_num_index" ON "public"."fsa_online_data" USING btree ("reg_num");
CREATE INDEX "fsa_online_data_start_date_index" ON "public"."fsa_online_data" USING btree ("start_date");
CREATE INDEX "fsa_online_data_status_id_index" ON "public"."fsa_online_data" USING btree ("status_id");
CREATE INDEX "fsa_online_data_trade_name_index" ON "public"."fsa_online_data" USING btree ("trade_name");
CREATE INDEX "fsa_online_data_type_id_index" ON "public"."fsa_online_data" USING btree ("type_id");

ALTER TABLE "fsa_online" ADD CONSTRAINT "fsa_online_last_original_version_foreign" FOREIGN KEY (last_original_version) REFERENCES fsa_online_data(id) ON UPDATE CASCADE ON DELETE RESTRICT NOT DEFERRABLE;
ALTER TABLE "fsa_online" ADD CONSTRAINT "fsa_online_use_version_foreign" FOREIGN KEY (use_version) REFERENCES fsa_online_data(id) ON UPDATE CASCADE ON DELETE RESTRICT NOT DEFERRABLE;

The "fsa_online" table contains about 800 000 records (possible grow up to 3 million records)

The "fsa_online_data" table contains about 3.5 million records (possible grow up to 15-25 million records)

I have the following query:

select 
  "fsa_online"."id", 
  "fsa_online"."fsa_uuid", 
  "fsa_online"."use_version", 
  "fsa_online"."last_original_version", 
  "fsa_online"."is_drug" 
from 
  "fsa_online" 
  inner join "fsa_online_data" as "data" on "data"."id" = CASE WHEN fsa_online.use_version IS NULL THEN fsa_online.last_original_version ELSE fsa_online.use_version END 
where 
  "unparsed_data" is not null 
  and "fsa_online"."deleted_at" is null 
limit 10 offset 0

It took about 150 ms time.

But when I need to sort data, for example by "cert_num" column of the "fsa_online_data" table, it took an extremely long time (about 63000 ms).

select 
  "fsa_online"."id", 
  "fsa_online"."fsa_uuid", 
  "fsa_online"."use_version", 
  "fsa_online"."last_original_version", 
  "fsa_online"."is_drug" 
from 
  "fsa_online" 
  inner join "fsa_online_data" as "data" on "data"."id" = CASE WHEN fsa_online.use_version IS NULL THEN fsa_online.last_original_version ELSE fsa_online.use_version END 
where 
  "unparsed_data" is not null 
  and "fsa_online"."deleted_at" is null 
ORDER BY "data"."cert_num"
limit 10 offset 0

I did a little trick by adding COLLATE "C" (but that is not correct, I really need to compare strings as Unicode strings) to ORDER BY statement and query took about 27500 ms.

Here is a query plan:

"Limit  (cost=1037857.75..1037857.78 rows=10 width=545)"
"  Output: fsa_online.id, fsa_online.fsa_uuid, fsa_online.use_version, fsa_online.last_original_version, fsa_online.is_drug, ((data.cert_num)::character varying(255))"
"  ->  Sort  (cost=1037857.75..1039085.55 rows=491120 width=545)"
"        Output: fsa_online.id, fsa_online.fsa_uuid, fsa_online.use_version, fsa_online.last_original_version, fsa_online.is_drug, ((data.cert_num)::character varying(255))"
"        Sort Key: ((data.cert_num)::character varying(255)) COLLATE "C""
"        ->  Hash Join  (cost=972267.98..1027244.83 rows=491120 width=545)"
"              Output: fsa_online.id, fsa_online.fsa_uuid, fsa_online.use_version, fsa_online.last_original_version, fsa_online.is_drug, data.cert_num"
"              Inner Unique: true"
"              Hash Cond: (CASE WHEN (fsa_online.use_version IS NULL) THEN fsa_online.last_original_version ELSE fsa_online.use_version END = data.id)"
"              ->  Seq Scan on public.fsa_online  (cost=0.00..19143.06 rows=899706 width=29)"
"                    Output: fsa_online.id, fsa_online.fsa_uuid, fsa_online.use_version, fsa_online.last_original_version, fsa_online.is_drug"
"                    Filter: (fsa_online.deleted_at IS NULL)"
"              ->  Hash  (cost=934126.84..934126.84 rows=2077451 width=17)"
"                    Output: data.cert_num, data.id"
"                    ->  Seq Scan on public.fsa_online_data data  (cost=0.00..934126.84 rows=2077451 width=17)"
"                          Output: data.cert_num, data.id"
"                          Filter: (data.unparsed_data IS NOT NULL)"

Also, I need to perform queries using LIKE statements on different text columns (full-text search is not good for this case, because text columns contain an arbitrary data, like serial numbers).

select count(*)
from "fsa_online" 
inner join "fsa_online_data" as "data" on "data"."id" = 
CASE WHEN fsa_online.use_version IS NULL THEN fsa_online.last_original_version ELSE fsa_online.use_version END 

The result is about 900 000 records.
The "fsa_online_data" table size is 7GB.

Hardware list:

  • Samsung SSD EVO 850
  • Intel Core i7 6700k
  • 16 GB DDR4 RAM

Tested on PostgreSQL versions: 9.6.9 and 10.5

How can I improve the performance of ORDER BY operations by text columns?
I want to reduce query execution time to 200-300ms.

Best Answer

The CASE in a join condition is killing you.

You should probably reorg your data so that use_version always contains the version to be used. If you can't do that, then create an expression index on the CASE expression:

create index on fsa_online ((CASE WHEN use_version IS NULL 
    THEN last_original_version ELSE use_version END));