Based on a simple innodb mysql 5.6 database schema like this:
What is the optimal indexing strategy for querying the "info" table where most of the search criteria will be IN and BETWEEN conditions? Here are some constraints/info on the possible queries:
- The date_created range will always be a part of the queries (BETWEEN);
- The user_id and the info_category (JOIN) conditions are optional;
- The results can be ordered by either the date_created or the view_count fields;
Here are some example queries to be executed:
#Fetch 100 active info based on date_created, user and info_type.
#Ordered by date_created and offset 100000
SELECT i.id,
i.title,
i.view_count
FROM info i
WHERE i.date_created BETWEEN '2016-01-08 11:45:32' AND '2017-01-27 11:45:32'
AND i.info_type_id IN (1,2,3,23)
AND i.user_id IN (1,5,120,387,45023) #optional
AND i.id > 100000
AND i.date_deleted IS NULL
ORDER BY i.date_created DESC
LIMIT 100;
#Fetch 100 active info based on date_created, info_type.
#Ordered by view_count and offset 100000
SELECT i.id,
i.title,
i.view_count
FROM info i
WHERE i.date_created BETWEEN '2016-01-08 11:45:32' AND '2017-01-27 11:45:32'
AND i.info_type_id IN (1,2,3,23)
AND i.user_id IN (1,5,120,387,45023) #optional
AND i.id > 100000
AND i.date_deleted IS NULL
ORDER BY i.view_count DESC
LIMIT 100;
#Fetch 100 active info based on date_created, info_type, user and category.
#Ordered by view_count and offset 100000
SELECT i.id,
i.title,
i.view_count
FROM info i
JOIN info_category ic on ic.info_id = i.id AND ic.category_id IN (1,2,3,4,10)
WHERE i.date_created BETWEEN '2016-01-28 11:45:32' AND '2017-01-27 11:45:32'
AND i.info_type_id IN (1,2,3,23)
AND i.user_id IN (1,5,120,387,45023) #optional
AND i.id > 500000
AND i.date_deleted IS NULL
GROUP BY i.id
ORDER BY i.view_count DESC
LIMIT 100;
Here is the schema creation sql:
--
-- Schema test
--
DROP SCHEMA IF EXISTS `test` ;
CREATE SCHEMA IF NOT EXISTS `test` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ;
SHOW WARNINGS;
--
-- Set default database
--
USE test;
--
-- Definition for table info_type
--
DROP TABLE IF EXISTS info_type;
CREATE TABLE info_type (
id tinyint(1) UNSIGNED NOT NULL AUTO_INCREMENT,
name varchar(20) NOT NULL,
PRIMARY KEY (id)
)
ENGINE = INNODB
CHARACTER SET utf8
COLLATE utf8_general_ci;
--
-- Definition for table category
--
DROP TABLE IF EXISTS category;
CREATE TABLE category (
id int(10) UNSIGNED NOT NULL AUTO_INCREMENT,
name varchar(20) NOT NULL,
PRIMARY KEY (id)
)
ENGINE = INNODB
AUTO_INCREMENT = 1
CHARACTER SET utf8
COLLATE utf8_general_ci;
--
-- Definition for table user
--
DROP TABLE IF EXISTS user;
CREATE TABLE user (
id int(10) UNSIGNED NOT NULL AUTO_INCREMENT,
name varchar(32) NOT NULL,
PRIMARY KEY (id)
)
ENGINE = INNODB
AUTO_INCREMENT = 1
CHARACTER SET utf8
COLLATE utf8_general_ci;
--
-- Definition for table info
--
DROP TABLE IF EXISTS info;
CREATE TABLE info (
id int(10) UNSIGNED NOT NULL AUTO_INCREMENT,
title varchar(255) NOT NULL,
date_created datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
user_id int(10) UNSIGNED NOT NULL,
info_type_id tinyint(3) UNSIGNED NOT NULL,
date_deleted datetime DEFAULT NULL,
view_count int(11) UNSIGNED NOT NULL DEFAULT 0,
PRIMARY KEY (id),
INDEX date_created_idx (date_created),
INDEX fk_info_type_id_idx (info_type_id),
INDEX user_id_idx (user_id),
CONSTRAINT fk_info_type_id FOREIGN KEY (info_type_id)
REFERENCES info_type (id) ON DELETE RESTRICT ON UPDATE RESTRICT,
CONSTRAINT fk_user_id FOREIGN KEY (user_id)
REFERENCES user (id) ON DELETE CASCADE ON UPDATE NO ACTION
)
ENGINE = INNODB
AUTO_INCREMENT = 1
CHARACTER SET utf8
COLLATE utf8_general_ci;
--
-- Definition for table info_category
--
DROP TABLE IF EXISTS info_category;
CREATE TABLE info_category (
info_id INT(10) UNSIGNED NOT NULL,
category_id INT(10) UNSIGNED NOT NULL,
PRIMARY KEY (info_id, category_id),
INDEX k_category_id_idx (category_id),
CONSTRAINT fk_info_id FOREIGN KEY (info_id)
REFERENCES info(id) ON DELETE CASCADE ON UPDATE NO ACTION,
CONSTRAINT fk_category_id FOREIGN KEY (category_id)
REFERENCES category(id) ON DELETE CASCADE ON UPDATE NO ACTION
)
ENGINE = INNODB
CHARACTER SET utf8
COLLATE utf8_general_ci;
Here is a script to create dummy data (beware, took almost 3 hours on my crappy VM) :
INSERT INTO info_type(name)
VALUES ('news'), ('announcement'), ('advice'), ('memo'), ('historic');
DELIMITER //
DROP PROCEDURE IF EXISTS createFakeData//
CREATE PROCEDURE createFakeData ()
BEGIN
DECLARE userCount int default 50000;
DECLARE infoCount int default 1000000;
DECLARE categoryCount int DEFAULT 1000;
DECLARE infoCategoryCount int;
DECLARE i int DEFAULT 1;
DECLARE j int;
DECLARE CONTINUE HANDLER FOR SQLSTATE '23000' BEGIN END;
#create fake users
WHILE i < userCount DO
INSERT INTO user(name) VALUES (CONCAT('user_', i));
SET i := i + 1;
END WHILE;
SET i = 1;
#create fake infos
WHILE i < infoCount DO
INSERT INTO info(title,
date_created,
user_id,
info_type_id,
view_count)
VALUES (CONCAT('title_', i),
DATE_ADD(NOW(), INTERVAL -(FLOOR(RAND() * 365) + 1) DAY),
FLOOR(RAND() * userCount) + 1,
FLOOR(RAND() * 5) + 1,
FLOOR(RAND() * 10000) + 1);
SET i := i + 1;
END WHILE;
SET i = 1;
#create fake categories
WHILE i < categoryCount DO
INSERT INTO category(name) VALUES (CONCAT('category_', i));
SET i = i + 1;
END WHILE;
SET i = 1;
#create fake info-category associations
WHILE i < infoCount DO
SET infoCategoryCount = FLOOR(RAND() * 10) + 1;
SET j = 0;
WHILE j < infoCategoryCount DO
INSERT INTO info_category (info_id, category_id)
VALUES (i, FLOOR(RAND() * categoryCount) + 1);
SET j = j + 1;
END WHILE;
SET i = i + 1;
END WHILE;
END//
DELIMITER ;
CALL createFakeData();
DROP PROCEDURE createFakeData;
Best Answer
It is essentially impossible to index on two ranges.
Your comment about "offset ... order by ..." does not make sense because the offset is based on one column, the order by, another.
The first query is best done with simply
which handles one of the ranges, plus the
ORDER BY
, thereby doing some filtering and avoiding a sort while being able to stop before the end of the table (if 100 rows are found).The other two are not that clear. Either have an index on the
ORDER BY
column so that sorting and limiting are handled, and/or have an index on whichever filtering column is the most selective.The
JOIN
in the third makes it even more difficult to optimize. The Optimizer can't know whether to start withic
or withi
; both have filtering.