Mysql – Create index for multiple WHERE EXISTS subqueries

indexMySQLperformancequery-performancesubquery

I have to find an efficient way to write a very specific query for an application that I'm developing.

The idea is the following: Find all the genetic markers for which there exists at least one plant line per species for which the genotype value is "1".

So I've got, say, 3 plant species: "species_1", "species_2" and "species_3".

I've got these tables: accessions (plant lines), markers (the genetic markers), genotypes (the genotype; one per marker, accession and dataset), datasets (to subdivide genotype values into datasets) and taxonomies (the plant species).

I've come up with this query:

SELECT
    markers.*, 1 AS "species_1",
    1 AS "species_2",
    1 AS "species_3"
FROM
    markers
LEFT JOIN genotypes g ON g.marker_id = markers.id
WHERE
    g.dataset_id = 3
AND EXISTS (
    SELECT
        1
    FROM
        genotypes
    LEFT JOIN accession ON genotypes.accession_id = accession.id
    LEFT JOIN taxonomies ON taxonomies.id = accession.taxonomy_id
    WHERE
        genotypes.marker_id = markers.id
    AND genotypes.a = 1
    AND taxonomies.genus = "genus1"
    AND taxonomies.species = "species1"
)
AND EXISTS (
    SELECT
        1
    FROM
        genotypes
    LEFT JOIN accession ON genotypes.accession_id = accession.id
    LEFT JOIN taxonomies ON taxonomies.id = accession.taxonomy_id
    WHERE
        genotypes.marker_id = markers.id
    AND genotypes.a = 1
    AND taxonomies.genus = "genus2"
    AND taxonomies.species = "species2"
)
AND EXISTS (
    SELECT
        1
    FROM
        genotypes
    LEFT JOIN accession ON genotypes.accession_id = accession.id
    LEFT JOIN taxonomies ON taxonomies.id = accession.taxonomy_id
    WHERE
        genotypes.marker_id = markers.id
    AND genotypes.a = 1
    AND taxonomies.genus = "genus3"
    AND taxonomies.species = "species3"
)

NOTE: If you're wondering about the SELECT 1 AS bits, this is only because I also run the query for 0 and NOT EXISTS. So this is only one example query.

It returns the correct result, but is quite slow on a genotypes table with roughtly 2 million rows.

I've had a look at EXPLAIN of the query and tried to generate indices so that each sub-query has an index to use. The index that did improve performance was on genotypes.marker_id, genotypes.dataset_id, genotypes.a.

However, the query takes around 7-8 seconds to return the result and I think it could run faster if only I can figure out which other index to use or how to restructure the query.

Does anyone have any suggestions as to how to improve performance?


EDIT:

CREATE TABLE `genotypes` (
    `id` BIGINT (20) NOT NULL AUTO_INCREMENT,
    `marker_id` INT (11) NOT NULL,
    `dataset_id` INT (11) NOT NULL,
    `accession_id` INT (11) NOT NULL,
    `a` VARCHAR (3) DEFAULT NULL,
    PRIMARY KEY (`id`),
    KEY `marker_id` (`marker_id`) USING BTREE,
    KEY `dataset_id` (`dataset_id`) USING BTREE,
    KEY `accession_id` (`accession_id`) USING BTREE,
    KEY `genotypes_marker_dataset_allele1` (
        `marker_id`,
        `dataset_id`,
        `a`
    ) USING BTREE,
    KEY `genotypes_marker_allele1` (`marker_id`, `a`) USING BTREE,
    CONSTRAINT `genotypes_ibfk_1` FOREIGN KEY (`marker_id`) REFERENCES `markers` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
    CONSTRAINT `genotypes_ibfk_2` FOREIGN KEY (`dataset_id`) REFERENCES `datasets` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
    CONSTRAINT `genotypes_ibfk_3` FOREIGN KEY (`accession_id`) REFERENCES `accessions` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = INNODB AUTO_INCREMENT = 2007206 DEFAULT CHARSET = latin1

CREATE TABLE `markers` (
    `id` INT (11) NOT NULL AUTO_INCREMENT,
    `marker_name` VARCHAR (45) DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE = INNODB AUTO_INCREMENT = 8665 DEFAULT CHARSET = latin1

CREATE TABLE `accessions` (
    `id` INT (11) NOT NULL AUTO_INCREMENT,
    `name` VARCHAR (255) NOT NULL,
    `taxonomy_id` INT (11) DEFAULT NULL,
    PRIMARY KEY (`id`),
    KEY `taxonomy_id` (`taxonomy_id`) USING BTREE,
    CONSTRAINT `accessions_ibfk_3` FOREIGN KEY (`taxonomy_id`) REFERENCES `taxonomies` (`id`) ON DELETE NO ACTION ON UPDATE CASCADE,

) ENGINE = INNODB AUTO_INCREMENT = 2304 DEFAULT CHARSET = latin1

CREATE TABLE `taxonomies` (
    `id` INT (11) NOT NULL AUTO_INCREMENT,
    `genus` VARCHAR (255) NOT NULL DEFAULT '',
    `species` VARCHAR (255) NOT NULL DEFAULT '',
    PRIMARY KEY (`id`),
    KEY `taxonomies_genus` (`genus`) USING BTREE,
    KEY `taxonomies_species` (`species`) USING BTREE,
    KEY `taxonomies_genus_species` (`genus`, `species`) USING BTREE
) ENGINE = INNODB AUTO_INCREMENT = 103 DEFAULT CHARSET = latin1

id  select_type table   type    possible_keys   key key_len ref rows    Extra
1   PRIMARY markers ALL PRIMARY             8803    Using where; Using temporary
1   PRIMARY g   ref marker_id,dataset_id,genotypes_marker_dataset_allele1,genotypes_marker_allele1  genotypes_marker_dataset_allele1    8   markers.id,const    104 Using index; Distinct
6   DEPENDENT SUBQUERY  taxonomies  ref PRIMARY,taxonomies_genus,taxonomies_species,taxonomies_genus_species    taxonomies_species  257 const   1   Using index condition; Using where
6   DEPENDENT SUBQUERY  genotypes   ref marker_id,accession_id,genotypes_marker_dataset_allele1,genotypes_marker_allele1    marker_id   4   markers.id  115 Using where
6   DEPENDENT SUBQUERY  accessions  eq_ref  PRIMARY,taxonomy_id PRIMARY 4   genotypes.accession_id  1   Using where
5   DEPENDENT SUBQUERY  taxonomies  ref PRIMARY,taxonomies_genus,taxonomies_species,taxonomies_genus_species    taxonomies_species  257 const   1   Using index condition; Using where
5   DEPENDENT SUBQUERY  genotypes   ref marker_id,accession_id,genotypes_marker_dataset_allele1,genotypes_marker_allele1    marker_id   4   markers.id  115 Using where
5   DEPENDENT SUBQUERY  accessions  eq_ref  PRIMARY,taxonomy_id PRIMARY 4   genotypes.accession_id  1   Using where
4   DEPENDENT SUBQUERY  taxonomies  ref PRIMARY,taxonomies_genus,taxonomies_species,taxonomies_genus_species    taxonomies_species  257 const   1   Using index condition; Using where
4   DEPENDENT SUBQUERY  genotypes   ref marker_id,accession_id,genotypes_marker_dataset_allele1,genotypes_marker_allele1    marker_id   4   markers.id  115 Using where
4   DEPENDENT SUBQUERY  accessions  eq_ref  PRIMARY,taxonomy_id PRIMARY 4   genotypes.accession_id  1   Using where
3   DEPENDENT SUBQUERY  taxonomies  ref PRIMARY,taxonomies_genus,taxonomies_species,taxonomies_genus_species    taxonomies_species  257 const   1   Using index condition; Using where
3   DEPENDENT SUBQUERY  genotypes   ref marker_id,accession_id,genotypes_marker_dataset_allele1,genotypes_marker_allele1    marker_id   4   markers.id  115 Using where
3   DEPENDENT SUBQUERY  accessions  eq_ref  PRIMARY,taxonomy_id PRIMARY 4   genotypes.accession_id  1   Using where
2   DEPENDENT SUBQUERY  taxonomies  ref PRIMARY,taxonomies_genus,taxonomies_species,taxonomies_genus_species    taxonomies_species  257 const   1   Using index condition; Using where
2   DEPENDENT SUBQUERY  genotypes   ref marker_id,accession_id,genotypes_marker_dataset_allele1,genotypes_marker_allele1    marker_id   4   markers.id  115 Using where
2   DEPENDENT SUBQUERY  accessions  eq_ref  PRIMARY,taxonomy_id PRIMARY 4   genotypes.accession_id  1   Using where

Best Answer

I'd try an index on (a, marker_id, accession_id) and on (a, accession_id, marker_id). And since that a is a varchar, use string literals, not numbers: where a = '1'. If the stored values are only numbers, then convert it from varchar to an appropriate number type (tinyiny, smallint). - ypercube